// Jump to content
//
// User:Polygnotus/Scripts/DeduplicateReferences.js
//
// From Wikipedia, the free encyclopedia
// Note: After saving, you have to bypass your browser's cache to see the changes. Google Chrome, Firefox, Microsoft Edge and Safari: Hold down the ⇧ Shift key and click the Reload toolbar button. For details and instructions about other browsers, see Wikipedia:Bypass your cache.
// <nowiki>
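// Only exact duplicates (character-for-character identical <ref>...</ref> tags) are handled.
// When a duplicated reference has no name, the script tries to derive one from the domain of the first URL it contains.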

// Function to deduplicate references in Wikipedia articles
function deduplicateReferences() {
  // Get the edit textarea and summary input
  const editTextarea = document.getElementById('wpTextbox1');
  const summaryInput = document.getElementById('wpSummary');
  if (!editTextarea || !summaryInput) return;

  let content = editTextarea.value;
  
  // Regular expression to match <ref> tags
  const refRegex = /<ref[^>]*>[\s\S]*?<\/ref>/gi;
  
  // Object to store all references
  const allRefs = {};
  
  // Set to store all used reference names
  const usedNames = new Set();
  
  // Blacklist of reference names to ignore
  const blacklist = [
    "doi_org",
    "jstor_org",
    "amazon_com",
    "books_google_com",
    "web_archive_org",
    "worldcat_org",
    "dx_doi_org"
    // Add more blacklisted names here
  ];
  
  // Function to extract domain name from URL
  function extractDomain(url) {
    try {
      let domain = new URL(url).hostname;
      domain = domain.replace(/^www\./, '');  // Remove 'www.' if present
      return domain === 'archive.org' ? extractDomain(url.split('archive.org/web/')[1]) : domain;
    } catch (e) {
      return null;
    }
  }
  
  // Function to generate a unique name for the reference
  function generateUniqueName(ref) {
    const urlMatch = ref.match(/https?:\/\/[^\s<>"]+/i);
    if (urlMatch) {
      const domain = extractDomain(urlMatch[0]);
      if (domain) {
        let baseName = domain.replace(/\./g, '_');
        let uniqueName = baseName;
        let counter = 1;
        while (usedNames.has(uniqueName)) {
          uniqueName = `${baseName}_${counter}`;
          counter++;
        }
        usedNames.add(uniqueName);
        return uniqueName;
      }
    }
    return null;
  }
  
  // Function to extract existing name from a reference
  function extractExistingName(ref) {
    const nameMatch = ref.match(/name\s*=\s*(["']?)([^"'\s/>]+(?:\s+[^"'\s/>]+)*)\1/i);
    return nameMatch ? nameMatch[2] : null;
  }
  
  // Function to create a reference tag
  function createRefTag(name, content = null) {
    if (content) {
      return `<ref name="${name}">${content}</ref>`;
    } else {
      return `<ref name="${name}" />`;
    }
  }
  
  // Function to check if a reference is blacklisted
  function isBlacklisted(ref) {
    const name = extractExistingName(ref);
    return name && blacklist.includes(name);
  }
  
  // First pass: collect all references and used names
  content.replace(refRegex, (match) => {
    if (!isBlacklisted(match)) {
      const existingName = extractExistingName(match);
      if (existingName) {
        usedNames.add(existingName);
      }
      if (allRefs[match]) {
        allRefs[match].count++;
      } else {
        allRefs[match] = { count: 1, name: existingName, firstOccurrence: match };
      }
    }
    return match;
  });
  
  // Second pass: replace duplicates with named references
  let deduplicatedCount = 0;
  content = content.replace(refRegex, (match) => {
    if (isBlacklisted(match)) {
      return match; // Return blacklisted references unchanged
    }
    if (allRefs[match] && allRefs[match].count > 1) {
      if (!allRefs[match].name) {
        // This is a duplicate without a name
        const generatedName = generateUniqueName(match);
        if (generatedName && !blacklist.includes(generatedName)) {
          allRefs[match].name = generatedName;
          allRefs[match].firstOccurrence = createRefTag(generatedName, match.match(/<ref[^>]*>([\s\S]*)<\/ref>/)[1]);
          return allRefs[match].firstOccurrence;
        }
      } else {
        // This is a named reference
        if (match === allRefs[match].firstOccurrence) {
          // This is the first occurrence, keep it as is
          return match;
        } else {
          // This is a subsequent occurrence, replace with short form
          deduplicatedCount++;
          return createRefTag(allRefs[match].name);
        }
      }
    }
    return match;  // Return unchanged for non-duplicates or blacklisted references
  });
  
  // Update the textarea with the deduplicated content
  if (deduplicatedCount > 0) {
    editTextarea.value = content;
    
    // Add edit summary
    let currentSummary = summaryInput.value;
    let deduplicationSummary = `Deduplicated ${deduplicatedCount} reference${deduplicatedCount > 1 ? 's' : ''}`;
    summaryInput.value = currentSummary ? `${currentSummary}${deduplicationSummary}` : deduplicationSummary;
    document.editform.wpMinoredit.checked = true;
  }
}

// Function to check if the edit textarea is ready
function isEditTextareaReady() {
  const editTextarea = document.getElementById('wpTextbox1');
  const summaryInput = document.getElementById('wpSummary');
  return editTextarea && editTextarea.value && summaryInput;
}

// Function to run deduplication when everything is ready
function runDeduplicationWhenReady() {
  if (isEditTextareaReady()) {
    deduplicateReferences();
  } else {
    // If not ready, check again after a short delay
    setTimeout(runDeduplicationWhenReady, 100);
  }
}

// Run the deduplication when the edit page is fully loaded
if (mw.config.get('wgAction') === 'edit') {
  if (document.readyState === 'complete') {
    runDeduplicationWhenReady();
  } else {
    window.addEventListener('load', runDeduplicationWhenReady);
  }
}


// </nowiki>