User:Harej/citation-watchlist-staging.js
Appearance
Code that you insert on this page could contain malicious content capable of compromising your account. If you import a script from another page with "importScript", "mw.loader.load", "iusc", or "lusc", take note that this causes you to dynamically load a remote script, which could be changed by others. Editors are responsible for all edits and actions they perform, including by scripts. User scripts are not centrally supported and may malfunction or become inoperable due to software changes. A guide to help you find broken scripts is available. If you are unsure whether code you are adding to this page is safe, you can ask at the appropriate village pump. This code will be executed when previewing this page. |
Documentation for this user script can be added at User:Harej/citation-watchlist-staging. |
/* Per-wiki configuration */
// Target wiki; these two values build every hostname used below.
const LANGUAGE = 'en';
const FAMILY = 'wikipedia';
// Classic Action API endpoint (supports batched queries).
const actionApiEndpoint = `https://${LANGUAGE}.${FAMILY}.org/w/api.php`;
// Wikimedia REST API base, used for revision source and diffs.
const restApiEndpoint = `https://api.wikimedia.org/core/v1`;
// On-wiki copy of the Public Suffix List (one suffix per line).
const publicSuffixList = "Wikipedia:Citation_Watchlist/Public_Suffix_List";
// On-wiki index page whose bullet links name the domain-list pages.
const listOfLists = "Wikipedia:Citation_Watchlist/Lists";
// Tooltip labels for the three severity levels.
const msgWarning = "Warning";
const msgCaution = "Caution";
const msgInspect = "Inspect";
// Markers prepended to watchlist entries: exclamation (warn), raised hand
// (caution), magnifying glass (inspect).
const warnEmoji = '\u2757';
const cautionEmoji = '\u270B';
const inspectEmoji = '\uD83D\uDD0E';
// Wikitext section headers that delimit each list on the list pages.
const warnSectionHeader = "==Warn==";
const cautionSectionHeader = "==Caution==";
const inspectSectionHeader = "==Inspect==";
// Minimum spacing between consecutive Action API requests, in milliseconds.
const delayMs = 50;
// Self-imposed hourly budget for REST API requests (kept below the server's
// 500/hour limit noted further down).
const maxRequestsPerHour = 400;
/*
Citation Watchlist Script – Highlights watchlist entries when questionable sources are added
author: Hacks/Hackers
license: GPL 4.0
*/
// Mutable module state, populated by runScript() before any processing.
let publicSuffixSet = new Set(); // public suffixes consumed by getRootDomain()
let warnList = new Set(); // domains that trigger the "Warning" marker
let cautionList = new Set(); // domains that trigger the "Caution" marker
let inspectList = new Set(); // domains that trigger the "Inspect" marker
let lastRequestTime = 0; // timestamp of the last Action API call (throttling)
// The Wikimedia REST API has a hard request limit of 500 per hour, and no clear
// way to batch these requests. As such, we need to track our requests, and to do
// so globally across the whole session (not just a single instantiation of the
// script.)
if (!localStorage.getItem('citationWatchlistRestApiRequestCount')) {
localStorage.setItem('citationWatchlistRestApiRequestCount', '0');
}
// Reset the shared counter every hour (3600000 ms) to mirror the rolling
// server-side window.
setInterval(() => {
localStorage.setItem('citationWatchlistRestApiRequestCount', '0');
console.log("Request count reset");
}, 3600000);
/**
 * Reads the session-wide REST API request counter from localStorage.
 *
 * @returns {number} current count, or 0 when unset or unparseable
 */
function getRequestCount() {
  const stored = localStorage.getItem('citationWatchlistRestApiRequestCount');
  const parsed = parseInt(stored, 10);
  return Number.isNaN(parsed) ? 0 : parsed;
}
/**
 * Bumps the session-wide REST API request counter by one and logs the
 * new value.
 */
function incrementRequestCount() {
  const updated = getRequestCount() + 1;
  localStorage.setItem('citationWatchlistRestApiRequestCount', updated.toString());
  console.log(`Request count incremented to ${updated}`);
}
/**
 * Inserts an emoji marker, with a tooltip listing the matched domains,
 * immediately before the given watchlist element. A data-processed-*
 * attribute keeps each marker type from being attached twice.
 *
 * @param {Element} element node the marker is inserted before
 * @param {string} emoji one of warnEmoji / cautionEmoji / inspectEmoji
 * @param {string[]} domains matched domains shown in the tooltip
 * @param {string} tooltipText severity label prefixed to the tooltip
 */
function prependEmojiWithTooltip(element, emoji, domains, tooltipText) {
  let processedType;
  switch (emoji) {
    case warnEmoji:
      processedType = 'warn';
      break;
    case cautionEmoji:
      processedType = 'caution';
      break;
    case inspectEmoji:
      processedType = 'inspect';
      break;
    default:
      console.error('Unsupported emoji type');
      return;
  }
  const markerAttribute = `data-processed-${processedType}`;
  if (element.getAttribute(markerAttribute) === 'true') {
    return;
  }
  const marker = document.createElement('span');
  marker.textContent = emoji + " ";
  marker.title = tooltipText + ": " + domains.join(", ");
  element.parentNode.insertBefore(marker, element);
  element.setAttribute(markerAttribute, 'true');
}
/**
 * Scans the current page (watchlist, recent changes, or page history) for
 * revision links and returns one entry per change:
 *   { oldrevision, newrevision?, element }
 * where `element` is the node a marker emoji may be prepended to.
 *
 * Fixes vs. the previous version:
 *  - query strings are read via `new URL(href).searchParams`; the old
 *    `new URLSearchParams(href)` parsed the whole href as a query string,
 *    folding "https://...?" into the first parameter's key so that first
 *    parameter could never be retrieved;
 *  - `urlParams` in the second loop was assigned without declaration,
 *    creating an implicit global (a ReferenceError in strict mode).
 */
async function parseWatchlist() {
  // Select all containers of the watchlist links to process them individually
  const entriesContainers = document.querySelectorAll('.mw-changeslist-links');
  const revisions = [];
  const revisionIds = [];
  let linkCounter = 0;
  // First pass (page histories): collect revision IDs whose parents are
  // then resolved in one batched Action API request.
  for (const container of entriesContainers) {
    const prevLink = container.querySelector('a.mw-history-histlinks-previous');
    if (prevLink) {
      const prevParams = new URL(prevLink.href).searchParams;
      revisionIds.push(prevParams.get('oldid'));
    }
  }
  console.log(revisionIds);
  const previousRevisionMap = await fetchPreviousRevisionIds(revisionIds);
  for (const container of entriesContainers) {
    const diffLink = container.querySelector('a.mw-changeslist-diff');
    const histLink = container.querySelector('a.mw-changeslist-history');
    const prevLink = container.querySelector('a.mw-history-histlinks-previous');
    const curLink = container.querySelector('a.mw-history-histlinks-current');
    if (diffLink) {
      // Watchlist / recent changes entry with a "diff" link.
      linkCounter += 1;
      const urlParams = new URL(diffLink.href).searchParams;
      revisions.push({
        oldrevision: urlParams.get('diff'),
        newrevision: urlParams.get('oldid'),
        element: diffLink.parentNode.parentNode
      });
    } else if (histLink) {
      // No "diff" link but a "hist" link: newly created page; fetch its
      // first revision and treat the whole page as added.
      linkCounter += 1;
      const urlParams = new URL(histLink.href).searchParams;
      const pageID = urlParams.get('curid');
      const firstID = await fetchFirstRevisionId(pageID);
      revisions.push({
        oldrevision: firstID,
        element: histLink.parentNode.parentNode
      });
    } else if (prevLink) {
      // Page history row: pair each revision with its parent from the map.
      linkCounter += 1;
      const urlParams = new URL(prevLink.href).searchParams;
      revisions.push({
        oldrevision: urlParams.get('oldid'),
        newrevision: previousRevisionMap[urlParams.get('oldid')],
        element: prevLink.parentNode.parentNode
      });
    } else if (curLink) {
      // No "prev" link means the page's first revision. We do not compare
      // against the current revision; extract the oldid and treat it like
      // a new page.
      linkCounter += 1;
      const urlParams = new URL(curLink.href).searchParams;
      revisions.push({
        oldrevision: urlParams.get('oldid'),
        element: curLink.parentNode.parentNode
      });
    }
  }
  // No links at all: a page history with exactly one revision. Resolve that
  // revision from the article ID MediaWiki exposes.
  if (linkCounter == 0) {
    const pageID = mw.config.get('wgArticleId');
    const firstID = await fetchFirstRevisionId(pageID);
    revisions.push({
      oldrevision: firstID,
      element: entriesContainers[0]
    });
  }
  return revisions;
}
/**
 * Promise-based sleep.
 *
 * @param {number} ms milliseconds to wait before resolving
 * @returns {Promise<void>}
 */
function delay(ms) {
  return new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
}
/**
 * Builds an Action API URL carrying the given query parameters.
 *
 * No longer declared `async`: the body never awaits, so the old version
 * needlessly wrapped a synchronous value in a Promise. The only caller
 * `await`s the result, and awaiting a plain value is a no-op, so this is
 * backward compatible.
 *
 * @param {Object<string, string|number>} params query parameters to append
 * @returns {URL} fully parameterized Action API URL
 */
function buildURL(params) {
  const url = new URL(actionApiEndpoint);
  for (const [key, value] of Object.entries(params)) {
    url.searchParams.append(key, value);
  }
  return url;
}
/**
 * Reduces a hostname to its registrable ("root") domain using a set of
 * public suffixes: the label immediately left of the longest... (first
 * found, scanning left to right) matching suffix, plus the suffix itself.
 *
 * Fix: when the hostname IS a public suffix (match at i === 0), the old
 * `slice(i - 1)` became `slice(-1)` and returned only the last label
 * (e.g. 'co.uk' -> 'uk'); there is no registrable label in that case, so
 * the hostname itself is returned instead.
 *
 * @param {string} hostname e.g. "www.example.co.uk"
 * @param {Set<string>} publicSuffixSet suffixes (entries may carry a "!"
 *   exception prefix, treated the same here)
 * @returns {string} registrable domain, or the hostname if no suffix matches
 */
function getRootDomain(hostname, publicSuffixSet) {
  const domainParts = hostname.split('.');
  for (let i = 0; i < domainParts.length; i++) {
    const candidate = domainParts.slice(i).join('.');
    if (publicSuffixSet.has(candidate) || publicSuffixSet.has(`!${candidate}`)) {
      // i === 0: the whole hostname is a suffix; nothing sits above it.
      return i === 0 ? hostname : domainParts.slice(i - 1).join('.');
    }
  }
  return hostname;
}
/**
 * Pulls every http/https URL out of a blob of text, normalizing each via
 * the URL constructor and discarding any candidate it rejects.
 *
 * @param {*} addedParts text to scan (coerced to string, matching the old
 *   regex-exec behavior for non-string input)
 * @returns {string[]} normalized URL hrefs in order of appearance
 */
function extractAddedURLs(addedParts) {
  const urlPattern = /https?:\/\/[^\s<"]+/g;
  const addedURLs = [];
  for (const found of String(addedParts).matchAll(urlPattern)) {
    const candidate = found[0];
    try {
      addedURLs.push(new URL(candidate).href);
    } catch (error) {
      console.error(`Invalid URL rejected: ${candidate}`);
    }
  }
  return addedURLs;
}
/**
 * Performs a throttled GET against the Action API and returns the parsed
 * JSON body. At least `delayMs` elapses between consecutive calls.
 *
 * @param {Object<string, string|number>} params Action API query parameters
 * @returns {Promise<object>} decoded JSON response
 * @throws rethrows network/HTTP failures after logging them
 */
async function fetchFromActionAPI(params) {
  const url = await buildURL(params);
  console.log(`Action API request: ${url}`);
  // Throttle: wait out the remainder of the minimum inter-request gap.
  const sinceLast = Date.now() - lastRequestTime;
  if (sinceLast < delayMs) {
    await delay(delayMs - sinceLast);
  }
  lastRequestTime = Date.now();
  try {
    const response = await fetch(url);
    if (!response.ok) {
      throw new Error(`Network response was not ok: ${response.statusText}`);
    }
    return await response.json();
  } catch (error) {
    console.error('Error fetching data from MediaWiki API:', error);
    throw error;
  }
}
/**
 * Downloads the on-wiki copy of the Public Suffix List (raw wikitext) and
 * returns its entries as a Set, skipping blanks and "//" comment lines.
 *
 * Fix: the response status is now checked. Previously an HTTP error page's
 * body was split line-by-line straight into the suffix set; now the error
 * falls through to the catch and an empty Set is returned, which makes
 * runScript() abort cleanly.
 *
 * @returns {Promise<Set<string>>} suffixes, or an empty Set on any failure
 */
async function fetchPublicSuffixList() {
  const pslUrl = `https://${LANGUAGE}.${FAMILY}.org/wiki/${publicSuffixList}?action=raw`;
  console.log(`Raw page text request: ${pslUrl}`);
  try {
    const response = await fetch(pslUrl);
    if (!response.ok) {
      throw new Error(`Network response was not ok: ${response.statusText}`);
    }
    const content = await response.text();
    const suffixSet = new Set();
    for (const rawLine of content.split('\n')) {
      const line = rawLine.trim();
      // Skip empty lines and PSL comments ("// ...").
      if (line && !line.startsWith('//')) {
        suffixSet.add(line);
      }
    }
    return suffixSet;
  } catch (error) {
    console.error("Error fetching Public Suffix List:", error);
    return new Set();
  }
}
/**
 * Fetches a revision's source or a structured diff from the Wikimedia REST
 * API, honoring the self-imposed hourly request budget.
 *
 * Fix: the response status is now checked, so HTTP errors return null via
 * the existing catch instead of silently extracting fields from an error
 * payload.
 *
 * @param {string} apiUrl full REST URL (revision or revision-compare)
 * @returns {Promise<object[]|string|null>} diff line array (compare), page
 *   source string (single revision), or null on failure
 */
async function fetchDiffFromAPI(apiUrl) {
  if (getRequestCount() >= maxRequestsPerHour) {
    // Crude back-off: sleep until the hourly setInterval reset has fired.
    console.warn("Request limit reached, waiting for reset...");
    await delay(3600000); // Wait for an hour if the limit is reached
  }
  incrementRequestCount();
  console.log(`Diff API request: ${apiUrl} (Request count: ${getRequestCount()})`);
  try {
    const response = await fetch(apiUrl);
    if (!response.ok) {
      throw new Error(`Network response was not ok: ${response.statusText}`);
    }
    const data = await response.json();
    // "source" is set for single-revision requests, "diff" for compares.
    return data["source"] || data["diff"];
  } catch (error) {
    console.error('Error fetching API content:', error);
    return null;
  }
}
/**
 * Sequentially fetches the diff (or full source, for new pages) of each
 * revision entry produced by parseWatchlist(), extracts URLs introduced by
 * the edit, reduces them to root domains, matches those against the
 * warn/caution/inspect lists, and prepends the matching emoji marker to
 * the watchlist entry's element.
 */
async function fetchDiffAndProcess(revisions) {
for (const revision of revisions) {
// Without a newrevision this fetches the revision's source; with one it
// hits the REST "compare" endpoint and gets a structured diff.
let apiUrl = `${restApiEndpoint}/${FAMILY}/${LANGUAGE}/revision/${revision.oldrevision}`;
if (revision.newrevision !== undefined) {
apiUrl += `/compare/${revision.newrevision}`;
}
const diff = await fetchDiffFromAPI(apiUrl);
let addedURLs = [];
if (Array.isArray(diff)) { // actual diffs are arrays; new pages are strings
// Types 2 and 4 represent "from".
// Types 1 and 5 represent "to".
// Type 3 represents changes within a line. It will be harder to extract URL changes in this case.
// NOTE(review): parseWatchlist stores the URL's "diff" parameter as
// oldrevision and "oldid" as newrevision, so this compare appears to run
// newer-to-older; the subtraction below (from-URLs minus to-URLs) relies
// on that ordering to yield ADDED URLs — confirm against the REST
// compare endpoint's from/to semantics.
let fromURLs = [];
let toURLs = [];
for (const diffLine of diff) {
const lineURLs = extractAddedURLs(diffLine.text);
// Caution: the loop variable URL shadows the global URL constructor
// inside this block (harmless here — no `new URL(...)` occurs below it).
for (const URL of lineURLs) {
if (diffLine.type === 2 || diffLine.type === 4) {
fromURLs.push(URL);
} else if (diffLine.type === 1 || diffLine.type === 5) {
toURLs.push(URL);
}
}
}
// Keep URLs that appear on "from" lines but on no "to" line.
const toURLSet = new Set(toURLs);
addedURLs = fromURLs.filter(url => !toURLSet.has(url));
} else {
// New page (plain source string): every URL in the text counts as added.
addedURLs = extractAddedURLs(diff);
}
console.log(`Old revision: ${revision.oldrevision}
New revision: ${revision.newrevision}
API URL: ${apiUrl}
Revision element: ${revision.element.innerHTML}
Added URLs: ${addedURLs.join(' ')}
`);
// Collect each matched domain once, at its highest applicable severity
// (warn over caution over inspect).
const matchedWarnDomains = [];
const matchedCautionDomains = [];
const matchedInspectDomains = [];
for (const url of addedURLs) {
const hostname = new URL(url).hostname;
const domain = getRootDomain(hostname, publicSuffixSet);
if (warnList.has(domain) && !matchedWarnDomains.includes(domain)) {
matchedWarnDomains.push(domain);
} else if (cautionList.has(domain) && !matchedCautionDomains.includes(domain)) {
matchedCautionDomains.push(domain);
} else if (inspectList.has(domain) && !matchedInspectDomains.includes(domain)) {
matchedInspectDomains.push(domain);
}
}
// One marker per severity level that had at least one match.
if (matchedWarnDomains.length > 0) {
prependEmojiWithTooltip(revision.element, warnEmoji, matchedWarnDomains, msgWarning);
}
if (matchedCautionDomains.length > 0) {
prependEmojiWithTooltip(revision.element, cautionEmoji, matchedCautionDomains, msgCaution);
}
if (matchedInspectDomains.length > 0) {
prependEmojiWithTooltip(revision.element, inspectEmoji, matchedInspectDomains, msgInspect);
}
}
}
/**
 * Fetches the wikitext of every domain-list page in one batched Action API
 * query and sorts the listed domains into warn/caution/inspect sets.
 *
 * Expected page format: "==Warn==" / "==Caution==" / "==Inspect==" section
 * headers, each followed by "* domain" bullet lines.
 *
 * Fix: missing or empty pages in the batch (no `revisions` array) are now
 * skipped with a warning; previously `.revisions[0]` threw, aborting ALL
 * lists because the error was rethrown.
 *
 * @param {string[]} pageNames wiki page titles to fetch
 * @returns {Promise<{warnList: Set<string>, cautionList: Set<string>, inspectList: Set<string>}>}
 * @throws rethrows Action API failures after logging them
 */
async function fetchAndOrganizeDomainLists(pageNames) {
  const params = {
    action: 'query',
    prop: 'revisions',
    titles: pageNames.join('|'), // Join all page names
    rvprop: 'content',
    rvslots: '*',
    format: 'json',
    origin: '*'
  };
  try {
    const data = await fetchFromActionAPI(params);
    const pages = data.query.pages;
    // Local sets intentionally shadow the module-level ones; the caller
    // merges these results into the globals.
    const warnList = new Set();
    const cautionList = new Set();
    const inspectList = new Set();
    for (const pageId in pages) {
      const revisions = pages[pageId].revisions;
      if (!revisions || revisions.length === 0) {
        // Missing/empty page: skip it rather than failing the whole batch.
        console.warn(`No revision content for page ${pageId}; skipping`);
        continue;
      }
      const content = revisions[0].slots.main['*'];
      let currentList = null;
      for (const line of content.split('\n')) {
        const trimmed = line.trim();
        if (trimmed === warnSectionHeader) {
          currentList = warnList;
        } else if (trimmed === cautionSectionHeader) {
          currentList = cautionList;
        } else if (trimmed === inspectSectionHeader) {
          currentList = inspectList;
        }
        // Bullet lines belong to the most recently seen section header.
        if (line.startsWith('*') && currentList) {
          currentList.add(line.substring(1).trim());
        }
      }
    }
    return {
      warnList,
      cautionList,
      inspectList
    };
  } catch (error) {
    console.error('Error fetching or parsing the page content:', error);
    throw error;
  }
}
/**
 * Resolves the parent (previous) revision ID for each given revision ID in
 * a single batched Action API query.
 *
 * @param {string[]} revisionIds revision IDs to look up
 * @returns {Promise<Object<string, number>>} map of revid -> parentid;
 *   empty object on failure
 */
async function fetchPreviousRevisionIds(revisionIds) {
  const params = {
    action: 'query',
    prop: 'revisions',
    revids: revisionIds.join('|'), // batch every ID into one request
    rvprop: 'ids',
    format: 'json',
    origin: '*'
  };
  try {
    const data = await fetchFromActionAPI(params);
    const revisionMap = {};
    for (const page of Object.values(data.query.pages)) {
      for (const revision of page.revisions || []) {
        revisionMap[revision.revid] = revision.parentid;
      }
    }
    return revisionMap;
  } catch (error) {
    console.error('Error fetching previous revision IDs:', error);
    return {};
  }
}
/**
 * Looks up the very first (oldest) revision ID of a page.
 *
 * @param {string|number} pageID MediaWiki page ID
 * @returns {Promise<number|null>} first revision ID, or null on failure
 */
async function fetchFirstRevisionId(pageID) {
  const params = {
    action: 'query',
    pageids: pageID,
    prop: 'revisions',
    rvlimit: 1,
    rvdir: 'newer', // oldest-first ordering, so the single result is rev #1
    format: 'json',
    origin: '*'
  };
  try {
    const data = await fetchFromActionAPI(params);
    const [firstPage] = Object.values(data.query.pages);
    const revisions = firstPage.revisions;
    if (!revisions || revisions.length === 0) {
      throw new Error('No revisions found for this page.');
    }
    return revisions[0].revid;
  } catch (error) {
    console.error('Error fetching first revision ID:', error);
    return null;
  }
}
/**
 * Returns the titles of the individual domain-list pages, read from the
 * bullet links ("* [[Page Title]]") on the list-of-lists page. Results are
 * cached in localStorage for four hours to avoid refetching on every view.
 *
 * @param {string} pageName title of the list-of-lists page
 * @returns {Promise<string[]>} linked page titles
 * @throws rethrows Action API failures after logging them
 */
async function fetchDomainListPages(pageName) {
  const cacheKey = `citationWatchlistFetchDomainListPages_${pageName}`;
  const cacheExpiration = 4 * 60 * 60 * 1000; // 4 hours in milliseconds
  const now = Date.now();
  const cachedData = localStorage.getItem(cacheKey);
  const cachedTimestamp = localStorage.getItem(`${cacheKey}_timestamp`);
  const cacheIsFresh = cachedData && cachedTimestamp &&
    (now - parseInt(cachedTimestamp, 10)) < cacheExpiration;
  if (cacheIsFresh) {
    console.log("Loaded list of lists from cache");
    return JSON.parse(cachedData);
  }
  const params = {
    action: 'query',
    prop: 'revisions',
    titles: pageName,
    rvprop: 'content',
    rvslots: '*',
    format: 'json',
    origin: '*'
  };
  try {
    const data = await fetchFromActionAPI(params);
    const pages = data.query.pages;
    const [pageId] = Object.keys(pages);
    const content = pages[pageId].revisions[0].slots.main['*'];
    const pageTitles = [];
    for (const line of content.split('\n')) {
      if (!line.startsWith('* [[')) {
        continue;
      }
      const match = line.match(/\[\[([^\]]+)\]\]/); // first [[Page Title]] on the line
      if (match) {
        pageTitles.push(match[1]);
      }
    }
    localStorage.setItem(cacheKey, JSON.stringify(pageTitles));
    localStorage.setItem(`${cacheKey}_timestamp`, now.toString());
    console.log("Loaded from API and stored in cache");
    return pageTitles;
  } catch (error) {
    console.error('Error fetching or parsing the page content:', error);
    throw error;
  }
}
/**
 * Entry point: loads the Public Suffix List and the domain lists, then
 * scans the current page and decorates entries that add listed domains.
 * Aborts early if the suffix list cannot be loaded, since root-domain
 * matching would be unreliable without it.
 */
async function runScript() {
  publicSuffixSet = await fetchPublicSuffixList();
  if (publicSuffixSet.size === 0) {
    console.error('Public Suffix List loading failed');
    return;
  }
  console.log("Welcome to Citation Watchlist");
  const listPages = await fetchDomainListPages(listOfLists);
  try {
    const lists = await fetchAndOrganizeDomainLists(listPages);
    // Merge the fetched lists into the module-level sets.
    for (const domain of lists.warnList) warnList.add(domain);
    for (const domain of lists.cautionList) cautionList.add(domain);
    for (const domain of lists.inspectList) inspectList.add(domain);
  } catch (error) {
    console.error('Error fetching domain lists:', error);
  }
  const watchlistRevisions = await parseWatchlist();
  await fetchDiffAndProcess(watchlistRevisions);
}
// Fix: the promise previously had no .catch, so any error thrown out of
// runScript (e.g. from fetchDomainListPages) became an unhandled rejection.
runScript()
  .then(() => console.log('Citation Watchlist script finished executing'))
  .catch((error) => console.error('Citation Watchlist script failed:', error));