User:Harej/citation-watchlist-staging.js
Appearance
Code that you insert on this page could contain malicious content capable of compromising your account. If you import a script from another page with "importScript", "mw.loader.load", "iusc", or "lusc", take note that this causes you to dynamically load a remote script, which could be changed by others. Editors are responsible for all edits and actions they perform, including by scripts. User scripts are not centrally supported and may malfunction or become inoperable due to software changes. A guide to help you find broken scripts is available. If you are unsure whether code you are adding to this page is safe, you can ask at the appropriate village pump. This code will be executed when previewing this page. |
Documentation for this user script can be added at User:Harej/citation-watchlist-staging. |
/* Per-wiki configuration */
// Target wiki; these two values build every hostname used below.
const LANGUAGE = 'en';
const FAMILY = 'wikipedia';
// Classic Action API endpoint (supports batched queries).
const actionApiEndpoint = `https://${LANGUAGE}.${FAMILY}.org/w/api.php`;
// Wikimedia REST API base, used for revision source and diffs.
const restApiEndpoint = `https://api.wikimedia.org/core/v1`;
// On-wiki copy of the Public Suffix List (one suffix per line).
const publicSuffixList = "Wikipedia:Citation_Watchlist/Public_Suffix_List";
// On-wiki index page whose bullet links name the domain-list pages.
const listOfLists = "Wikipedia:Citation_Watchlist/Lists";
// Tooltip labels for the three severity levels.
const msgWarning = "Warning";
const msgCaution = "Caution";
const msgInspect = "Inspect";
// Markers prepended to watchlist entries: exclamation (warn), raised hand
// (caution), magnifying glass (inspect).
const warnEmoji = '\u2757';
const cautionEmoji = '\u270B';
const inspectEmoji = '\uD83D\uDD0E';
// Wikitext section headers that delimit each list on the list pages.
const warnSectionHeader = "==Warn==";
const cautionSectionHeader = "==Caution==";
const inspectSectionHeader = "==Inspect==";
// Minimum spacing between consecutive Action API requests, in milliseconds.
const delayMs = 50;
// Self-imposed hourly budget for REST API requests (kept below the server's
// 500/hour limit noted further down).
const maxRequestsPerHour = 400;
/*
Citation Watchlist Script – Highlights watchlist entries when questionable sources are added
author: Hacks/Hackers
license: GPL 4.0
*/
// Mutable module state, populated by runScript() before any processing.
let publicSuffixSet = new Set(); // public suffixes consumed by getRootDomain()
let warnList = new Set(); // domains that trigger the "Warning" marker
let cautionList = new Set(); // domains that trigger the "Caution" marker
let inspectList = new Set(); // domains that trigger the "Inspect" marker
let lastRequestTime = 0; // timestamp of the last Action API call (throttling)
// The Wikimedia REST API has a hard request limit of 500 per hour, and no clear
// way to batch these requests. As such, we need to track our requests, and to do
// so globally across the whole session (not just a single instantiation of the
// script.)
if (!localStorage.getItem('citationWatchlistRestApiRequestCount')) {
localStorage.setItem('citationWatchlistRestApiRequestCount', '0');
}
// Reset the shared counter every hour (3600000 ms) to mirror the rolling
// server-side window.
setInterval(() => {
localStorage.setItem('citationWatchlistRestApiRequestCount', '0');
console.log("Request count reset");
}, 3600000);
/**
 * Reads the session-wide REST API request counter from localStorage.
 *
 * @returns {number} current count, or 0 when unset or unparseable
 */
function getRequestCount() {
  const stored = localStorage.getItem('citationWatchlistRestApiRequestCount');
  const parsed = parseInt(stored, 10);
  return Number.isNaN(parsed) ? 0 : parsed;
}
/**
 * Bumps the session-wide REST API request counter by one and logs the
 * new value.
 */
function incrementRequestCount() {
  const updated = getRequestCount() + 1;
  localStorage.setItem('citationWatchlistRestApiRequestCount', updated.toString());
  console.log(`Request count incremented to ${updated}`);
}
/**
 * Inserts an emoji marker, with a tooltip listing the matched domains,
 * immediately before the given watchlist element. A data-processed-*
 * attribute keeps each marker type from being attached twice.
 *
 * @param {Element} element node the marker is inserted before
 * @param {string} emoji one of warnEmoji / cautionEmoji / inspectEmoji
 * @param {string[]} domains matched domains shown in the tooltip
 * @param {string} tooltipText severity label prefixed to the tooltip
 */
function prependEmojiWithTooltip(element, emoji, domains, tooltipText) {
  let processedType;
  switch (emoji) {
    case warnEmoji:
      processedType = 'warn';
      break;
    case cautionEmoji:
      processedType = 'caution';
      break;
    case inspectEmoji:
      processedType = 'inspect';
      break;
    default:
      console.error('Unsupported emoji type');
      return;
  }
  const markerAttribute = `data-processed-${processedType}`;
  if (element.getAttribute(markerAttribute) === 'true') {
    return;
  }
  const marker = document.createElement('span');
  marker.textContent = emoji + " ";
  marker.title = tooltipText + ": " + domains.join(", ");
  element.parentNode.insertBefore(marker, element);
  element.setAttribute(markerAttribute, 'true');
}
/**
 * Scans the current page (watchlist, recent changes, or page history) for
 * revision links and returns one entry per change:
 *   { oldrevision, newrevision?, element }
 * where `element` is the node a marker emoji may be prepended to.
 *
 * Fixes vs. the previous version:
 *  - query strings are read via `new URL(href).searchParams`; the old
 *    `new URLSearchParams(href)` parsed the whole href as a query string,
 *    folding "https://...?" into the first parameter's key so that first
 *    parameter could never be retrieved;
 *  - `urlParams` in the second loop was assigned without declaration,
 *    creating an implicit global (a ReferenceError in strict mode).
 */
async function parseWatchlist() {
  // Select all containers of the watchlist links to process them individually
  const entriesContainers = document.querySelectorAll('.mw-changeslist-links');
  const revisions = [];
  const revisionIds = [];
  let linkCounter = 0;
  // First pass (page histories): collect revision IDs whose parents are
  // then resolved in one batched Action API request.
  for (const container of entriesContainers) {
    const prevLink = container.querySelector('a.mw-history-histlinks-previous');
    if (prevLink) {
      const prevParams = new URL(prevLink.href).searchParams;
      revisionIds.push(prevParams.get('oldid'));
    }
  }
  console.log(revisionIds);
  const previousRevisionMap = await fetchPreviousRevisionIds(revisionIds);
  for (const container of entriesContainers) {
    const diffLink = container.querySelector('a.mw-changeslist-diff');
    const histLink = container.querySelector('a.mw-changeslist-history');
    const prevLink = container.querySelector('a.mw-history-histlinks-previous');
    const curLink = container.querySelector('a.mw-history-histlinks-current');
    if (diffLink) {
      // Watchlist / recent changes entry with a "diff" link.
      linkCounter += 1;
      const urlParams = new URL(diffLink.href).searchParams;
      revisions.push({
        oldrevision: urlParams.get('diff'),
        newrevision: urlParams.get('oldid'),
        element: diffLink.parentNode.parentNode
      });
    } else if (histLink) {
      // No "diff" link but a "hist" link: newly created page; fetch its
      // first revision and treat the whole page as added.
      linkCounter += 1;
      const urlParams = new URL(histLink.href).searchParams;
      const pageID = urlParams.get('curid');
      const firstID = await fetchFirstRevisionId(pageID);
      revisions.push({
        oldrevision: firstID,
        element: histLink.parentNode.parentNode
      });
    } else if (prevLink) {
      // Page history row: pair each revision with its parent from the map.
      linkCounter += 1;
      const urlParams = new URL(prevLink.href).searchParams;
      revisions.push({
        oldrevision: urlParams.get('oldid'),
        newrevision: previousRevisionMap[urlParams.get('oldid')],
        element: prevLink.parentNode.parentNode
      });
    } else if (curLink) {
      // No "prev" link means the page's first revision. We do not compare
      // against the current revision; extract the oldid and treat it like
      // a new page.
      linkCounter += 1;
      const urlParams = new URL(curLink.href).searchParams;
      revisions.push({
        oldrevision: urlParams.get('oldid'),
        element: curLink.parentNode.parentNode
      });
    }
  }
  // No links at all: a page history with exactly one revision. Resolve that
  // revision from the article ID MediaWiki exposes.
  if (linkCounter == 0) {
    const pageID = mw.config.get('wgArticleId');
    const firstID = await fetchFirstRevisionId(pageID);
    revisions.push({
      oldrevision: firstID,
      element: entriesContainers[0]
    });
  }
  return revisions;
}
/**
 * Promise-based sleep.
 *
 * @param {number} ms milliseconds to wait before resolving
 * @returns {Promise<void>}
 */
function delay(ms) {
  return new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
}
/**
 * Builds an Action API URL carrying the given query parameters.
 *
 * No longer declared `async`: the body never awaits, so the old version
 * needlessly wrapped a synchronous value in a Promise. The only caller
 * `await`s the result, and awaiting a plain value is a no-op, so this is
 * backward compatible.
 *
 * @param {Object<string, string|number>} params query parameters to append
 * @returns {URL} fully parameterized Action API URL
 */
function buildURL(params) {
  const url = new URL(actionApiEndpoint);
  for (const [key, value] of Object.entries(params)) {
    url.searchParams.append(key, value);
  }
  return url;
}
/**
 * Reduces a hostname to its registrable ("root") domain using a set of
 * public suffixes: the label immediately left of the longest... (first
 * found, scanning left to right) matching suffix, plus the suffix itself.
 *
 * Fix: when the hostname IS a public suffix (match at i === 0), the old
 * `slice(i - 1)` became `slice(-1)` and returned only the last label
 * (e.g. 'co.uk' -> 'uk'); there is no registrable label in that case, so
 * the hostname itself is returned instead.
 *
 * @param {string} hostname e.g. "www.example.co.uk"
 * @param {Set<string>} publicSuffixSet suffixes (entries may carry a "!"
 *   exception prefix, treated the same here)
 * @returns {string} registrable domain, or the hostname if no suffix matches
 */
function getRootDomain(hostname, publicSuffixSet) {
  const domainParts = hostname.split('.');
  for (let i = 0; i < domainParts.length; i++) {
    const candidate = domainParts.slice(i).join('.');
    if (publicSuffixSet.has(candidate) || publicSuffixSet.has(`!${candidate}`)) {
      // i === 0: the whole hostname is a suffix; nothing sits above it.
      return i === 0 ? hostname : domainParts.slice(i - 1).join('.');
    }
  }
  return hostname;
}
/**
 * Pulls every http/https URL out of a blob of text, normalizing each via
 * the URL constructor and discarding any candidate it rejects.
 *
 * @param {*} addedParts text to scan (coerced to string, matching the old
 *   regex-exec behavior for non-string input)
 * @returns {string[]} normalized URL hrefs in order of appearance
 */
function extractAddedURLs(addedParts) {
  const urlPattern = /https?:\/\/[^\s<"]+/g;
  const addedURLs = [];
  for (const found of String(addedParts).matchAll(urlPattern)) {
    const candidate = found[0];
    try {
      addedURLs.push(new URL(candidate).href);
    } catch (error) {
      console.error(`Invalid URL rejected: ${candidate}`);
    }
  }
  return addedURLs;
}
/**
 * Performs a throttled GET against the Action API and returns the parsed
 * JSON body. At least `delayMs` elapses between consecutive calls.
 *
 * @param {Object<string, string|number>} params Action API query parameters
 * @returns {Promise<object>} decoded JSON response
 * @throws rethrows network/HTTP failures after logging them
 */
async function fetchFromActionAPI(params) {
  const url = await buildURL(params);
  console.log(`Action API request: ${url}`);
  // Throttle: wait out the remainder of the minimum inter-request gap.
  const sinceLast = Date.now() - lastRequestTime;
  if (sinceLast < delayMs) {
    await delay(delayMs - sinceLast);
  }
  lastRequestTime = Date.now();
  try {
    const response = await fetch(url);
    if (!response.ok) {
      throw new Error(`Network response was not ok: ${response.statusText}`);
    }
    return await response.json();
  } catch (error) {
    console.error('Error fetching data from MediaWiki API:', error);
    throw error;
  }
}
/**
 * Downloads the on-wiki copy of the Public Suffix List (raw wikitext) and
 * returns its entries as a Set, skipping blanks and "//" comment lines.
 *
 * Fix: the response status is now checked. Previously an HTTP error page's
 * body was split line-by-line straight into the suffix set; now the error
 * falls through to the catch and an empty Set is returned, which makes
 * runScript() abort cleanly.
 *
 * @returns {Promise<Set<string>>} suffixes, or an empty Set on any failure
 */
async function fetchPublicSuffixList() {
  const pslUrl = `https://${LANGUAGE}.${FAMILY}.org/wiki/${publicSuffixList}?action=raw`;
  console.log(`Raw page text request: ${pslUrl}`);
  try {
    const response = await fetch(pslUrl);
    if (!response.ok) {
      throw new Error(`Network response was not ok: ${response.statusText}`);
    }
    const content = await response.text();
    const suffixSet = new Set();
    for (const rawLine of content.split('\n')) {
      const line = rawLine.trim();
      // Skip empty lines and PSL comments ("// ...").
      if (line && !line.startsWith('//')) {
        suffixSet.add(line);
      }
    }
    return suffixSet;
  } catch (error) {
    console.error("Error fetching Public Suffix List:", error);
    return new Set();
  }
}
/**
 * Fetches a revision's source or a structured diff from the Wikimedia REST
 * API, honoring the self-imposed hourly request budget.
 *
 * Fix: the response status is now checked, so HTTP errors return null via
 * the existing catch instead of silently extracting fields from an error
 * payload.
 *
 * @param {string} apiUrl full REST URL (revision or revision-compare)
 * @returns {Promise<object[]|string|null>} diff line array (compare), page
 *   source string (single revision), or null on failure
 */
async function fetchDiffFromAPI(apiUrl) {
  if (getRequestCount() >= maxRequestsPerHour) {
    // Crude back-off: sleep until the hourly setInterval reset has fired.
    console.warn("Request limit reached, waiting for reset...");
    await delay(3600000); // Wait for an hour if the limit is reached
  }
  incrementRequestCount();
  console.log(`Diff API request: ${apiUrl} (Request count: ${getRequestCount()})`);
  try {
    const response = await fetch(apiUrl);
    if (!response.ok) {
      throw new Error(`Network response was not ok: ${response.statusText}`);
    }
    const data = await response.json();
    // "source" is set for single-revision requests, "diff" for compares.
    return data["source"] || data["diff"];
  } catch (error) {
    console.error('Error fetching API content:', error);
    return null;
  }
}
/**
 * Sequentially fetches the diff (or full source, for new pages) of each
 * revision entry produced by parseWatchlist(), extracts URLs introduced by
 * the edit, reduces them to root domains, matches those against the
 * warn/caution/inspect lists, and prepends the matching emoji marker to
 * the watchlist entry's element.
 */
async function fetchDiffAndProcess(revisions) {
for (const revision of revisions) {
// Without a newrevision this fetches the revision's source; with one it
// hits the REST "compare" endpoint and gets a structured diff.
let apiUrl = `${restApiEndpoint}/${FAMILY}/${LANGUAGE}/revision/${revision.oldrevision}`;
if (revision.newrevision !== undefined) {
apiUrl += `/compare/${revision.newrevision}`;
}
const diff = await fetchDiffFromAPI(apiUrl);
let addedURLs = [];
if (Array.isArray(diff)) { // actual diffs are arrays; new pages are strings
// Types 2 and 4 represent "from".
// Types 1 and 5 represent "to".
// Type 3 represents changes within a line. It will be harder to extract URL changes in this case.
// NOTE(review): parseWatchlist stores the URL's "diff" parameter as
// oldrevision and "oldid" as newrevision, so this compare appears to run
// newer-to-older; the subtraction below (from-URLs minus to-URLs) relies
// on that ordering to yield ADDED URLs — confirm against the REST
// compare endpoint's from/to semantics.
let fromURLs = [];
let toURLs = [];
for (const diffLine of diff) {
const lineURLs = extractAddedURLs(diffLine.text);
// Caution: the loop variable URL shadows the global URL constructor
// inside this block (harmless here — no `new URL(...)` occurs below it).
for (const URL of lineURLs) {
if (diffLine.type === 2 || diffLine.type === 4) {
fromURLs.push(URL);
} else if (diffLine.type === 1 || diffLine.type === 5) {
toURLs.push(URL);
}
}
}
// Keep URLs that appear on "from" lines but on no "to" line.
const toURLSet = new Set(toURLs);
addedURLs = fromURLs.filter(url => !toURLSet.has(url));
} else {
// New page (plain source string): every URL in the text counts as added.
addedURLs = extractAddedURLs(diff);
}
console.log(`Old revision: ${revision.oldrevision}
New revision: ${revision.newrevision}
API URL: ${apiUrl}
Revision element: ${revision.element.innerHTML}
Added URLs: ${addedURLs.join(' ')}
`);
// Collect each matched domain once, at its highest applicable severity
// (warn over caution over inspect).
const matchedWarnDomains = [];
const matchedCautionDomains = [];
const matchedInspectDomains = [];
for (const url of addedURLs) {
const hostname = new URL(url).hostname;
const domain = getRootDomain(hostname, publicSuffixSet);
if (warnList.has(domain) && !matchedWarnDomains.includes(domain)) {
matchedWarnDomains.push(domain);
} else if (cautionList.has(domain) && !matchedCautionDomains.includes(domain)) {
matchedCautionDomains.push(domain);
} else if (inspectList.has(domain) && !matchedInspectDomains.includes(domain)) {
matchedInspectDomains.push(domain);
}
}
// One marker per severity level that had at least one match.
if (matchedWarnDomains.length > 0) {
prependEmojiWithTooltip(revision.element, warnEmoji, matchedWarnDomains, msgWarning);
}
if (matchedCautionDomains.length > 0) {
prependEmojiWithTooltip(revision.element, cautionEmoji, matchedCautionDomains, msgCaution);
}
if (matchedInspectDomains.length > 0) {
prependEmojiWithTooltip(revision.element, inspectEmoji, matchedInspectDomains, msgInspect);
}
}
}
/**
 * Fetches the wikitext of every domain-list page in one batched Action API
 * query and sorts the listed domains into warn/caution/inspect sets.
 *
 * Expected page format: "==Warn==" / "==Caution==" / "==Inspect==" section
 * headers, each followed by "* domain" bullet lines.
 *
 * Fix: missing or empty pages in the batch (no `revisions` array) are now
 * skipped with a warning; previously `.revisions[0]` threw, aborting ALL
 * lists because the error was rethrown.
 *
 * @param {string[]} pageNames wiki page titles to fetch
 * @returns {Promise<{warnList: Set<string>, cautionList: Set<string>, inspectList: Set<string>}>}
 * @throws rethrows Action API failures after logging them
 */
async function fetchAndOrganizeDomainLists(pageNames) {
  const params = {
    action: 'query',
    prop: 'revisions',
    titles: pageNames.join('|'), // Join all page names
    rvprop: 'content',
    rvslots: '*',
    format: 'json',
    origin: '*'
  };
  try {
    const data = await fetchFromActionAPI(params);
    const pages = data.query.pages;
    // Local sets intentionally shadow the module-level ones; the caller
    // merges these results into the globals.
    const warnList = new Set();
    const cautionList = new Set();
    const inspectList = new Set();
    for (const pageId in pages) {
      const revisions = pages[pageId].revisions;
      if (!revisions || revisions.length === 0) {
        // Missing/empty page: skip it rather than failing the whole batch.
        console.warn(`No revision content for page ${pageId}; skipping`);
        continue;
      }
      const content = revisions[0].slots.main['*'];
      let currentList = null;
      for (const line of content.split('\n')) {
        const trimmed = line.trim();
        if (trimmed === warnSectionHeader) {
          currentList = warnList;
        } else if (trimmed === cautionSectionHeader) {
          currentList = cautionList;
        } else if (trimmed === inspectSectionHeader) {
          currentList = inspectList;
        }
        // Bullet lines belong to the most recently seen section header.
        if (line.startsWith('*') && currentList) {
          currentList.add(line.substring(1).trim());
        }
      }
    }
    return {
      warnList,
      cautionList,
      inspectList
    };
  } catch (error) {
    console.error('Error fetching or parsing the page content:', error);
    throw error;
  }
}
/**
 * Resolves the parent (previous) revision ID for each given revision ID in
 * a single batched Action API query.
 *
 * @param {string[]} revisionIds revision IDs to look up
 * @returns {Promise<Object<string, number>>} map of revid -> parentid;
 *   empty object on failure
 */
async function fetchPreviousRevisionIds(revisionIds) {
  const params = {
    action: 'query',
    prop: 'revisions',
    revids: revisionIds.join('|'), // batch every ID into one request
    rvprop: 'ids',
    format: 'json',
    origin: '*'
  };
  try {
    const data = await fetchFromActionAPI(params);
    const revisionMap = {};
    for (const page of Object.values(data.query.pages)) {
      for (const revision of page.revisions || []) {
        revisionMap[revision.revid] = revision.parentid;
      }
    }
    return revisionMap;
  } catch (error) {
    console.error('Error fetching previous revision IDs:', error);
    return {};
  }
}
/**
 * Looks up the very first (oldest) revision ID of a page.
 *
 * @param {string|number} pageID MediaWiki page ID
 * @returns {Promise<number|null>} first revision ID, or null on failure
 */
async function fetchFirstRevisionId(pageID) {
  const params = {
    action: 'query',
    pageids: pageID,
    prop: 'revisions',
    rvlimit: 1,
    rvdir: 'newer', // oldest-first ordering, so the single result is rev #1
    format: 'json',
    origin: '*'
  };
  try {
    const data = await fetchFromActionAPI(params);
    const [firstPage] = Object.values(data.query.pages);
    const revisions = firstPage.revisions;
    if (!revisions || revisions.length === 0) {
      throw new Error('No revisions found for this page.');
    }
    return revisions[0].revid;
  } catch (error) {
    console.error('Error fetching first revision ID:', error);
    return null;
  }
}
/**
 * Returns the titles of the individual domain-list pages, read from the
 * bullet links ("* [[Page Title]]") on the list-of-lists page. Results are
 * cached in localStorage for four hours to avoid refetching on every view.
 *
 * @param {string} pageName title of the list-of-lists page
 * @returns {Promise<string[]>} linked page titles
 * @throws rethrows Action API failures after logging them
 */
async function fetchDomainListPages(pageName) {
  const cacheKey = `citationWatchlistFetchDomainListPages_${pageName}`;
  const cacheExpiration = 4 * 60 * 60 * 1000; // 4 hours in milliseconds
  const now = Date.now();
  const cachedData = localStorage.getItem(cacheKey);
  const cachedTimestamp = localStorage.getItem(`${cacheKey}_timestamp`);
  const cacheIsFresh = cachedData && cachedTimestamp &&
    (now - parseInt(cachedTimestamp, 10)) < cacheExpiration;
  if (cacheIsFresh) {
    console.log("Loaded list of lists from cache");
    return JSON.parse(cachedData);
  }
  const params = {
    action: 'query',
    prop: 'revisions',
    titles: pageName,
    rvprop: 'content',
    rvslots: '*',
    format: 'json',
    origin: '*'
  };
  try {
    const data = await fetchFromActionAPI(params);
    const pages = data.query.pages;
    const [pageId] = Object.keys(pages);
    const content = pages[pageId].revisions[0].slots.main['*'];
    const pageTitles = [];
    for (const line of content.split('\n')) {
      if (!line.startsWith('* [[')) {
        continue;
      }
      const match = line.match(/\[\[([^\]]+)\]\]/); // first [[Page Title]] on the line
      if (match) {
        pageTitles.push(match[1]);
      }
    }
    localStorage.setItem(cacheKey, JSON.stringify(pageTitles));
    localStorage.setItem(`${cacheKey}_timestamp`, now.toString());
    console.log("Loaded from API and stored in cache");
    return pageTitles;
  } catch (error) {
    console.error('Error fetching or parsing the page content:', error);
    throw error;
  }
}
/**
 * Entry point: loads the Public Suffix List and the domain lists, then
 * scans the current page and decorates entries that add listed domains.
 * Aborts early if the suffix list cannot be loaded, since root-domain
 * matching would be unreliable without it.
 */
async function runScript() {
  publicSuffixSet = await fetchPublicSuffixList();
  if (publicSuffixSet.size === 0) {
    console.error('Public Suffix List loading failed');
    return;
  }
  console.log("Welcome to Citation Watchlist");
  const listPages = await fetchDomainListPages(listOfLists);
  try {
    const lists = await fetchAndOrganizeDomainLists(listPages);
    // Merge the fetched lists into the module-level sets.
    for (const domain of lists.warnList) warnList.add(domain);
    for (const domain of lists.cautionList) cautionList.add(domain);
    for (const domain of lists.inspectList) inspectList.add(domain);
  } catch (error) {
    console.error('Error fetching domain lists:', error);
  }
  const watchlistRevisions = await parseWatchlist();
  await fetchDiffAndProcess(watchlistRevisions);
}
// Fix: the promise previously had no .catch, so any error thrown out of
// runScript (e.g. from fetchDomainListPages) became an unhandled rejection.
runScript()
  .then(() => console.log('Citation Watchlist script finished executing'))
  .catch((error) => console.error('Citation Watchlist script failed:', error));