User:Trey314159/homoglyphHunter.js
Appearance
Code that you insert on this page could contain malicious content capable of compromising your account. If you import a script from another page with "importScript", "mw.loader.load", "iusc", or "lusc", take note that this causes you to dynamically load a remote script, which could be changed by others. Editors are responsible for all edits and actions they perform, including by scripts. User scripts are not centrally supported and may malfunction or become inoperable due to software changes. A guide to help you find broken scripts is available. If you are unsure whether code you are adding to this page is safe, you can ask at the appropriate village pump. This code will be executed when previewing this page.
Documentation for this user script can be added at User:Trey314159/homoglyphHunter.
// Latin-to-Cyrillic mapping
// Each key is a Latin character; each value is the look-alike Cyrillic character.
// A few values are longer than one character (a letter plus a combining accent).
var Lat2CyrMap = {
'a':'а', 'A':'А', 'ă':'ӑ', 'Ă':'Ӑ', 'ä':'ӓ', 'Ä':'Ӓ', 'æ':'ӕ', 'Æ':'Ӕ', 'B':'В', 'c':'с', 'C':'С', 'ç':'ҫ', 'Ç':'Ҫ', 'e':'е', 'E':'Е', 'è':'ѐ', 'È':'Ѐ', 'ë':'ё', 'Ë':'Ё', 'ĕ':'ӗ', 'Ĕ':'Ӗ', 'ə':'ә', 'Ə':'Ә', 'H':'Н', 'i':'і', 'I':'І', 'ï':'ї', 'Ï':'Ї', 'j':'ј', 'J':'Ј', 'k':'к', 'K':'К', 'M':'М', 'o':'о', 'O':'О', 'ö':'ӧ', 'Ö':'Ӧ', 'p':'р', 'P':'Р', 'Q':'Ԛ', 's':'ѕ', 'S':'Ѕ', 'T':'Т', 'W':'Ԝ', 'x':'х', 'X':'Х', 'y':'у', 'Y':'У', 'ȳ':'ӯ', 'ÿ':'ӱ', 'á':'а́', 'é':'е́', 'í':'і́', 'ó':'о́', 'ý':'у́', 'ħ':'ћ', 'ɜ':'з' };
// Cyrillic-to-Latin replacements applied by the "fix-enc" (encoding error) repair
var EncErrMap = {'ц':'ö', 'ч':'ç', 'у':'ã', 'б':'á', 'ж':'æ'};
// Cyrillic-to-Latin mapping; starts empty and is filled by invertAndLengthFilter() below
var Cyr2LatMap = {};
// invert Lat2CyrMap to Cyr2LatMap and strip keys of length > 1 in both directions
// (only single-character Cyrillic values are inverted, so the accented
// multi-character values above get no Cyrillic-to-Latin entry)
invertAndLengthFilter(Lat2CyrMap, Cyr2LatMap);
// Character-class bodies for each script: "All" covers every character treated
// as belonging to the script, "Homoglyph" only the characters that have a
// look-alike in the other script (the keys of the corresponding map).
var LatAllPat = 'A-Za-zÀ-ɏɐ-ʯ';
var CyrAllPat = 'Ѐ-ԯ';
var LatHomoglyphPat = Object.keys(Lat2CyrMap).join('');
var CyrHomoglyphPat = Object.keys(Cyr2LatMap).join('');

// Latin regexes: a run of script characters, a single script character, a run of homoglyphs
var LatAllRegex = new RegExp(`[${LatAllPat}]+`, 'g');
var LatOneRegex = new RegExp(`[${LatAllPat}]`, 'g');
var LatHomoglyphRegex = new RegExp(`[${LatHomoglyphPat}]+`, 'g');

// Cyrillic regexes, same three shapes
var CyrAllRegex = new RegExp(`[${CyrAllPat}]+`, 'g');
var CyrOneRegex = new RegExp(`[${CyrAllPat}]`, 'g');
var CyrHomoglyphRegex = new RegExp(`[${CyrHomoglyphPat}]+`, 'g');

// insource: regex (as search text, slashes included) for a word that contains
// a Cyrillic character directly next to a Latin one, in either order
var insourcePat = `/[${CyrAllPat}${LatAllPat}]*([${CyrAllPat}][${LatAllPat}]|[${LatAllPat}][${CyrAllPat}])[${CyrAllPat}${LatAllPat}]*/`;
// Config
// These are globals on purpose: the inline onclick handlers inside letsGo
// read and write viceversa / sortbyscore by name.
// viceversa: 1 = also list predominantly Cyrillic words, 0 = skip them
var viceversa = 1;
// sortbyscore: 1 = order results by "magic score", 0 = by raw results count
var sortbyscore = 1;
// NOTE(review): limitresults is not referenced by any other code visible in this file
var limitresults = 50;
// busy flag: set to 1 while a search is running; findHomoglyphs() and
// getHHTitles() return immediately when it is set
var slowFetch = 0;
// server timestamp (curtimestamp) stored by getHHSnippets(); fixHHArticle()
// sends it as starttimestamp when saving an edit
var startTime = '';
// HTML for the options panel; initialize_HHunter() places it inside #HHStatus
// and then fills in the two [wait for it...] placeholders
var letsGo = "\
<b>Options:</b><br>\
<ul><li>Looking for <b>Latin</b> words with <i>Cyrillic</i> characters.</li>\
<li><a href='#' onclick='viceversa=1-viceversa; $(\"#FHOptViceVersa\").html(viceversa?\"Also show\":\"Skip\");'>Vice Versa</a>: <b><div style='display:inline' id='FHOptViceVersa'>[Wait for it...]</div></b> predominantly Cyrillic words.</li>\
<li><a href='#' onclick='sortbyscore=1-sortbyscore; $(\"#FHOptSort\").html(sortbyscore?\"magic score\":\"raw results count\");'>Sort</a>: Sort by <b><div style='display:inline' id='FHOptSort'>[wait for it...]</div></b>. (Magic score puts impactful, more obviously correctable results first.)</li>\
</ul><br>\
<a href='#' onclick='findHomoglyphs();'>Let's go</a>!";
/**
 * Open the Homoglyph Hunter overlay.
 *
 * Creates the fixed-position #HHContainer element the first time it is
 * called, then (on every call) makes it visible, rebuilds its content from
 * scratch and syncs the two option labels with the current config globals.
 */
function initialize_HHunter() {
    if ($("#HHContainer").length === 0) {
        var overlay = document.createElement('div');
        overlay.setAttribute('id', 'HHContainer');
        // Near-full-screen panel above the page; hidden until shown below
        Object.assign(overlay.style, {
            position: 'fixed',
            width: "90%",
            height: "90%",
            top: "3%",
            left: "5%",
            margin: "0",
            zIndex: "1000000",
            backgroundColor: "#fefefe",
            border: "1px solid #aaa",
            overflow: "scroll",
            display: "none"
        });
        document.body.append(overlay);
    }

    var panelHtml =
        "<div style='padding:0.75em; direction:ltr' id='HHContent'> " +
        "<div style='float:right; margin:0; padding:0; font-family:sans-serif; cursor:pointer; color:#999; text-align:center; padding:1px' onclick='closeHH();'>ⓧ</div> " +
        "<h4 style='text-align:center'>Homoglyph Hunter</h4> " +
        "<div id=HHStatus>" + letsGo + "</div> " +
        "<div id=HHMixedWords></div><br><br><div id=HHSnippets></div> " +
        "</div><br><br><br><br><br><br>";
    $('#HHContainer').css('display', 'inline').html(panelHtml);

    // Replace the placeholders inside letsGo with the current option values
    var viceVersaLabel = "Skip";
    if (viceversa) {
        viceVersaLabel = "Also show";
    }
    var sortLabel = "raw results count";
    if (sortbyscore) {
        sortLabel = "magic score";
    }
    $("#FHOptViceVersa").html(viceVersaLabel);
    $("#FHOptSort").html(sortLabel);
}
/**
 * Hide the Homoglyph Hunter overlay (its content is left in place).
 */
function closeHH() {
    var $overlay = $('#HHContainer');
    $overlay.css({ display: 'none' });
}
/**
 * Fetch the wikitext of one page and append, to #HHSnippets, every occurrence
 * of a mixed-script word with up to 75 characters of context on either side,
 * together with one-click fix links and open/edit links for the page.
 *
 * Nothing is appended when the page cannot be read or does not contain the word.
 * Side effect: stores the API's curtimestamp in the global startTime.
 *
 * @param {string} mixedWord The mixed-script word to look for (matched literally).
 * @param {string} theTitle  Title of the page to fetch.
 */
function getHHSnippets (mixedWord, theTitle) {
    // mixedWord is data, not a pattern: neutralise any regex metacharacters
    var literalWord = mixedWord.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    var regexSearch = new mw.Api().get( {
        action: 'query',
        prop: 'revisions',
        titles: theTitle,
        rvprop: 'content',
        format: 'json',
        curtimestamp: '1'
    } );
    $.when( regexSearch ).then(function(article) {
        startTime = article.curtimestamp;

        // The response is keyed by page id; only one title was requested,
        // so take the first (only) entry.
        var pages = (article.query && article.query.pages) || {};
        var page;
        for (var prop in pages) {
            if (Object.prototype.hasOwnProperty.call(pages, prop)) {
                page = pages[prop];
                break;
            }
        }
        // Missing or unreadable pages come back without revisions
        var revision = page && page.revisions && page.revisions[0];
        if (!revision || typeof revision["*"] !== 'string') {
            return;
        }
        var articleText = revision["*"];

        var contextRegex = new RegExp(".{0,75}" + literalWord + ".{0,75}", "g");
        var myMatches = articleText.match(contextRegex);
        if (!myMatches) {
            return;
        }

        var mixedWordRegex = new RegExp(literalWord, "g");
        var coloredWord = colorizeString(mixedWord);
        // Function replacer: the replacement is inserted verbatim, with no "$" expansion
        var displayTitle = theTitle.replace(mixedWordRegex, function () {
            return coloredWord;
        });
        var latVersion = convertScript(mixedWord, Cyr2LatMap);
        var cyrVersion = convertScript(mixedWord, Lat2CyrMap);
        var encVersion = convertScript(mixedWord, EncErrMap);

        var resultHTML = '<font size=-1>';
        // Offer a fix only when it would actually change the word
        if (latVersion != mixedWord) {
            resultHTML += '(<a style="color:blue" href=# onclick=\'fixHHArticle(this, "' + mixedWord + '","' + quoteEsc(theTitle) + '", 1)\'><b>fix-latn:</b> ' + colorizeString(latVersion) + '</a>) ';
        }
        if (cyrVersion != mixedWord) {
            resultHTML += '(<a style="color:red" href=# onclick=\'fixHHArticle(this, "' + mixedWord + '","' + quoteEsc(theTitle) + '", 2)\'><b>fix-cyrl:</b> ' + colorizeString(cyrVersion) + '</a>) ';
        }
        if (encVersion != mixedWord) {
            resultHTML += '(<a style="color:black" href=# onclick=\'fixHHArticle(this, "' + mixedWord + '","' + quoteEsc(theTitle) + '", 3)\'><b>fix-enc:</b> ' + colorizeString(encVersion) + '</a>) ';
        }
        resultHTML += '(<a href="/wiki/' + quoteEsc(theTitle) + '" target=_blank>open</a>) (<a href="/w/index.php?title=' + quoteEsc(theTitle) + '&action=edit" target=_blank>edit</a>)</font> <b>' + displayTitle + '</b> <ol>';

        for (var i = 0; i < myMatches.length; i++) {
            // The context is raw wikitext: escape it so markup in the page is
            // displayed as text instead of being injected into this document.
            var display = myMatches[i].replace(/&/g, "&amp;").replace(/</g, "&lt;");
            // Yellow: places where a replacement is risky (link targets,
            // parameter names, tags, URLs, media file extensions)
            display = display.replace(/\[\[[^\]|]+]?]?|([^\s=|]+\s*=)|(&lt;[^\s|>]*>?)|https?:\/\/[^\s|]*|(\.(jpe?g|gif|png|svg|tiff|xcf|mp3|mid|ogg|flac|wav|djvu?|pdf|tab))/ig, "<span style='background-color:#FFFF99;'>$&</span>");
            // Green: the mixed-script word itself
            display = display.replace(mixedWordRegex, "<span style='background-color:#CFC'>$&</span>");
            resultHTML += '<li style="font-family:monospace">...' + display + '...</li>';
        }
        resultHTML += '</ol><br>';
        $('#HHSnippets').append(resultHTML);
    });
    return;
}
/**
 * First stage of the per-word lookup: search page titles for a mixed-script
 * word, show snippets for every hit, then hand over to getHHTemplates().
 *
 * Does nothing while another search is running (slowFetch set). Takes the
 * busy flag itself; the flag is released at the end of the chain in
 * getHHFullText(), or here if the title search fails.
 *
 * @param {string} target The mixed-script word to look up.
 */
function getHHTitles( target ) {
    if (slowFetch) {
        return;
    }
    slowFetch = 1;
    // Words longer than two characters are searched as a regex (intitle:/.../),
    // shorter ones as plain intitle: text
    var title_target = target;
    if (target.length > 2) {
        title_target = '/' + target + '/';
    }
    $('#HHSnippets').html('<i>Be careful changing text in links!</i><br><br>');
    var titlesearch = new mw.Api().get( {
        action: 'query',
        list: 'search',
        format: 'json',
        srlimit: '50',
        srsearch: 'intitle:' + title_target
    } ).fail( function( code, result ) {
        // The chain stops here, so release the busy flag; otherwise every
        // later search would be silently ignored.
        slowFetch = 0;
        if ( code === "http" ) {
            alert( "HTTP error: " + result.textStatus ); // result.xhr contains the jqXHR object
        } else if ( code === "ok-but-empty" ) {
            alert( "Error: Got an empty response from the server" );
        } else {
            alert( "API error: " + code );
        }
    } );
    $.when( titlesearch ).then(function(results) {
        var searches = results.query.search;
        if (searches.length !== 0) {
            $('#HHSnippets').append('<h4>Titles (' + searches.length + ') for ' + colorizeString(target) + '</h4>');
            for (var i = 0; i < searches.length; i++) {
                getHHSnippets(target, searches[i].title);
            }
        }
        // Next stage: template search
        getHHTemplates(target);
    });
    return;
}
/**
 * Second stage of the per-word lookup: run a search with the `template:`
 * prefix for the quoted word, show snippets for every hit, then hand over
 * to getHHFullText().
 *
 * Keeps the busy flag (slowFetch) set; it is released at the end of the chain
 * in getHHFullText(), or here if this search fails.
 *
 * @param {string} target The mixed-script word to look up.
 */
function getHHTemplates( target ) {
    slowFetch = 1;
    var templatesearch = new mw.Api().get( {
        action: 'query',
        list: 'search',
        format: 'json',
        srlimit: '50',
        srsearch: 'template:"' + target + '"'
    } ).fail( function( code, result ) {
        // The chain stops here, so release the busy flag; otherwise every
        // later search would be silently ignored.
        slowFetch = 0;
        if ( code === "http" ) {
            alert( "HTTP error: " + result.textStatus ); // result.xhr contains the jqXHR object
        } else if ( code === "ok-but-empty" ) {
            alert( "Error: Got an empty response from the server" );
        } else {
            alert( "API error: " + code );
        }
    } );
    $.when( templatesearch ).then(function(results) {
        var searches = results.query.search;
        if (searches.length !== 0) {
            $('#HHSnippets').append('<h4>Templates (' + searches.length + ') for ' + colorizeString(target) + '</h4>');
            for (var i = 0; i < searches.length; i++) {
                getHHSnippets(target, searches[i].title);
            }
        }
        // Next stage: full-text search
        getHHFullText(target);
    });
    return;
}
/**
 * Final stage of the per-word lookup: run an insource: search for the word,
 * show snippets for every hit, and release the busy flag (slowFetch) so the
 * next lookup can start.
 *
 * @param {string} target The mixed-script word to look up.
 */
function getHHFullText( target ) {
    slowFetch = 1;
    var fulltextsearch = new mw.Api().get( {
        action: 'query',
        list: 'search',
        format: 'json',
        srlimit: '50',
        srsearch: 'insource:' + target
    } ).fail( function( code, result ) {
        // Release the busy flag on failure too; otherwise every later
        // search would be silently ignored.
        slowFetch = 0;
        if ( code === "http" ) {
            alert( "HTTP error: " + result.textStatus ); // result.xhr contains the jqXHR object
        } else if ( code === "ok-but-empty" ) {
            alert( "Error: Got an empty response from the server" );
        } else {
            alert( "API error: " + code );
        }
    } );
    $.when( fulltextsearch ).then(function(results) {
        var searches = results.query.search;
        if (searches.length !== 0) {
            $('#HHSnippets').append('<h4>Full-Text Results (' + searches.length + ') for ' + colorizeString(target) + '</h4>');
            for (var i = 0; i < searches.length; i++) {
                getHHSnippets(target, searches[i].title);
            }
        }
        // End of the titles -> templates -> full-text chain
        slowFetch = 0;
    });
    return;
}
/**
 * Replace every occurrence of a mixed-script word in a page and save the page
 * as a minor edit.
 *
 * The clicked link is disabled and hidden immediately; when the save finishes
 * a FIXED or ERROR marker is inserted after it.
 *
 * @param {Element} linkElem  The fix link that was clicked.
 * @param {string}  mixedWord The word to replace (matched literally).
 * @param {string}  theTitle  Page title as encoded by quoteEsc().
 * @param {number}  direction 3 = encoding-error fix (EncErrMap),
 *                            2 = Latin to Cyrillic, anything else = Cyrillic to Latin.
 */
function fixHHArticle( linkElem, mixedWord, theTitle, direction ) {
    var title = quoteUnesc(theTitle);
    var errorMarker = "<b style='font-size:80%'>ERROR</b>";
    var $link = $(linkElem);
    // Prevent a second click while the edit is in flight
    $link.attr('onclick','');
    $link.css('display', 'none');

    // Shared handler for a failed fetch or a failed save
    function reportFailure( code, result ) {
        if ( code === "http" ) {
            alert( "HTTP error: " + result.textStatus ); // result.xhr contains the jqXHR object
        } else if ( code === "ok-but-empty" ) {
            alert( "Error: Got an empty response from the server" );
        } else {
            alert( "API error: " + code );
        }
        $link.after(errorMarker);
    }

    //Get content of article
    new mw.Api().get( {
        action: 'query',
        titles: title,
        prop: [ 'revisions', 'info' ],
        rvprop: 'content',
        indexpageids: 1,
        rawcontinue: ''
    } ).done( function( result ) {
        // pageids is an array; exactly one title was requested
        var pageIds = (result.query && result.query.pageids) || [];
        var page = pageIds.length ? result.query.pages[ pageIds[0] ] : undefined;
        var revision = page && page.revisions && page.revisions[0];
        if (!revision || typeof revision['*'] !== 'string') {
            // Missing or unreadable page: nothing to fix
            alert( "Error: could not read the current text of " + title );
            $link.after(errorMarker);
            return;
        }

        var replacement;
        var fixMsg;
        if (direction == 3) {
            // Encoding Error
            replacement = convertScript(mixedWord, EncErrMap);
            fixMsg = 'fix encoding error: ' + mixedWord + ' → ' + replacement;
        }
        else if (direction == 2) {
            // Latin to Cyrillic; the summary brackets the characters being converted
            replacement = convertScript(mixedWord, Lat2CyrMap);
            fixMsg = 'fix homoglyphs: convert Latin characters in ' + mixedWord.replace(LatHomoglyphRegex, "[$&]") + ' to Cyrillic';
        }
        else {
            // Cyrillic to Latin
            replacement = convertScript(mixedWord, Cyr2LatMap);
            fixMsg = 'fix homoglyphs: convert Cyrillic characters in ' + mixedWord.replace(CyrHomoglyphRegex, "[$&]") + ' to Latin';
        }

        // mixedWord is data, not a pattern: escape regex metacharacters, and use
        // a function replacer so "$" in the replacement is inserted verbatim
        var mixedWordRegex = new RegExp(mixedWord.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'), "g");
        var artContents = revision['*'].replace(mixedWordRegex, function () {
            return replacement;
        });

        new mw.Api().postWithToken( 'edit', {
            action: 'edit',
            title: title,
            text: artContents,
            summary: fixMsg,
            minor: '1',
            starttimestamp: startTime
        } ).done( function() {
            $link.after("<b style='font-size:80%'>FIXED</b>");
        } ).fail( reportFailure );
    } ).fail( reportFailure );
}
/**
 * Main search: find mixed-script words with an insource: regex search, count
 * how many pages contain each word, and render the words as clickable links
 * in #HHMixedWords.
 *
 * Does nothing while another search is running (slowFetch set). The busy flag
 * is released once the results are rendered, when nothing is found, or when
 * any request fails.
 */
function findHomoglyphs() {
    if (slowFetch) {
        return;
    }
    slowFetch = 1;
    $('#HHStatus').html("<b>Fetching data... this can take 30 seconds or more.</b>");

    // Show the final HTML, clear the progress message and release the busy flag
    function finish(html) {
        $('#HHStatus').html('');
        $('#HHMixedWords').html(html);
        slowFetch = 0;
    }

    // Any failed request ends the run: release the busy flag and tell the user
    function reportFailure(code, result) {
        slowFetch = 0;
        $('#HHStatus').html('');
        if ( code === "http" ) {
            alert( "HTTP error: " + result.textStatus ); // result.xhr contains the jqXHR object
        } else if ( code === "ok-but-empty" ) {
            alert( "Error: Got an empty response from the server" );
        } else {
            alert( "API error: " + code );
        }
    }

    var regexSearch = new mw.Api().get( {
        action: 'query',
        list: 'search',
        format: 'json',
        srlimit: '10000',
        srsearch: 'insource:' + insourcePat
    } );
    $.when( regexSearch ).then(function(x) {
        var hits = x.query.search;
        if (hits.length === 0) {
            finish("Nothing found.");
            return;
        }

        // word -> fraction of its characters that belong to its dominant script.
        // Prototype-less objects, because the keys are arbitrary words.
        var matches = Object.create(null);
        var re = /<span class="searchmatch">(.*?)<\/span>/g;
        for (var i = 0; i < hits.length; i++) {
            var snip = hits[i].snippet;
            var m;
            while ((m = re.exec(snip)) !== null) {
                var word = m[1];
                if (word === '' || matches[word] !== undefined) {
                    continue;
                }
                matches[word] = (word.match(LatOneRegex) || []).length / word.length;
                // With vice-versa scoring, rate mostly-Cyrillic words by their Cyrillic share
                if (viceversa == 1 && sortbyscore == 1 && matches[word] < 0.5) {
                    matches[word] = (word.match(CyrOneRegex) || []).length / word.length;
                }
            }
        }
        var terms = Object.keys(matches).sort(function(a, b) {
            return matches[b] - matches[a];
        });

        // One count query per word. Each promise is reduced to a single value
        // (the hit count) so that $.when() hands the callback exactly one
        // argument per word, whether there is one word or many.
        var mwapi = new mw.Api();
        var artCountPromises = terms.map(function (term) {
            return mwapi.get( {
                action: 'query',
                list: 'search',
                format: 'json',
                srlimit: '1',
                srsearch: 'insource:' + term
            } ).then(function (res) {
                return res.query.searchinfo.totalhits;
            });
        });

        return $.when( ...artCountPromises ).then(function (...totals) {
            var count = Object.create(null);
            var score = Object.create(null);
            for (var j = 0; j < terms.length; j++) {
                var t = terms[j];
                count[t] = totals[j];
                // give some weight to score, but more to Latin-ness, with a small x/1000 addition to sort 0-count items properly
                score[t] = Math.log10(count[t] + 1) * matches[t] * matches[t] + (matches[t] / 1000);
            }
            terms.sort(function(a, b) {
                if (sortbyscore) {
                    return score[b] - score[a];
                }
                return count[b] - count[a];
            });

            var resultHTML = '';
            for (var k = 0; k < terms.length; k++) {
                var term = terms[k];
                if (score[term] <= 0 && viceversa == 0) {
                    continue;
                }
                if (viceversa == 1 || matches[term] >= 0.5) {
                    if (resultHTML) {
                        resultHTML += ' — ';
                    }
                    resultHTML += "<a href='#' style='color:black' onclick='copyToClipboard(\"" + term + "\"); getHHTitles(\"" + term + "\")'>" + colorizeString(term) + "</a> (" + count[term] + ")";
                }
            }
            finish(resultHTML === '' ? "Nothing found." : resultHTML);
        });
    } ).fail( reportFailure );
    return;
}
/**
 * Wrap each character of a string in a coloured <span> according to script:
 * Latin homoglyphs #00F, other Latin #AAF, Cyrillic homoglyphs #F00, other
 * Cyrillic #FAA. Characters of neither script are passed through unchanged.
 *
 * @param {string} str Text to colourise.
 * @returns {string} HTML with one span per Latin/Cyrillic character.
 */
function colorizeString(str) {
    var pieces = [];
    var idx = 0;
    while (idx < str.length) {
        var ch = str[idx];
        var color = null;
        if (ch.match(LatOneRegex)) {
            // Latin: strong blue for homoglyphs, pale blue otherwise
            color = ch.match(LatHomoglyphRegex) ? '#00F' : '#AAF';
        } else if (ch.match(CyrOneRegex)) {
            // Cyrillic: strong red for homoglyphs, pale red otherwise
            color = ch.match(CyrHomoglyphRegex) ? '#F00' : '#FAA';
        }
        if (color === null) {
            // neither script: leave as is
            pieces.push(ch);
        } else {
            pieces.push("<span style='color:" + color + "'>" + ch + "</span>");
        }
        idx += 1;
    }
    return pieces.join('');
}
/**
 * Copy a string to the clipboard by selecting it in a temporary <input>
 * appended to the page and issuing the "copy" command.
 *
 * @param {string} string Text to copy.
 */
function copyToClipboard(string) {
    var $scratch = $("<input>").appendTo("body");
    $scratch.val(string);
    $scratch.select();
    document.execCommand("copy");
    $scratch.remove();
}
/**
 * Build the reverse of a character map, and prune the source map in place.
 *
 * For every entry key -> value in src: when the value is a single character,
 * dest gets value -> key; when the key is longer than one character, the
 * entry is deleted from src. Multi-character values stay in src but are not
 * inverted.
 *
 * @param {Object} src  Map to invert; modified in place.
 * @param {Object} dest Map that receives the inverted entries.
 */
function invertAndLengthFilter(src, dest) {
    Object.keys(src).forEach(function (from) {
        var to = src[from];
        if (to.length === 1) {
            dest[to] = from;
        }
        if (from.length > 1) {
            delete src[from];
        }
    });
}
/**
 * Convert a string character by character using a lookup map; characters
 * without a (non-empty) entry in the map are kept as they are.
 *
 * @param {string} str Text to convert.
 * @param {Object} map Character-to-replacement map.
 * @returns {string} The converted text.
 */
function convertScript(str, map) {
    return Array.prototype.map.call(str, function (ch) {
        return map[ch] || ch;
    }).join('');
}
/**
 * Percent-encode single and double quotes (' -> %27, " -> %22) so the string
 * can be placed inside quoted attribute values; quoteUnesc() reverses it.
 *
 * @param {string} theString Text to encode.
 * @returns {string} Text with both kinds of quote percent-encoded.
 */
function quoteEsc (theString) {
    return theString.replace(/['"]/g, function (quote) {
        return quote === "'" ? "%27" : "%22";
    });
}
/**
 * Reverse quoteEsc(): turn %27 back into ' and %22 back into ".
 *
 * @param {string} theString Text encoded by quoteEsc().
 * @returns {string} Text with the quotes restored.
 */
function quoteUnesc (theString) {
    return theString.replace(/%2[27]/g, function (code) {
        return code === "%27" ? "'" : '"';
    });
}
// Once mediawiki.util is loaded and the DOM is ready, add a "Homoglyph Hunter"
// link to the 'p-tb' portlet; clicking it opens the overlay.
$.when( mw.loader.using( ['mediawiki.util'] ), $.ready ).then( function () {
    var hunterLink = mw.util.addPortletLink( 'p-tb', '#', 'Homoglyph Hunter' );
    $( hunterLink ).on( 'click', function ( event ) {
        // keep the '#' href from being followed
        event.preventDefault();
        initialize_HHunter();
    } );
} );