User:Mutley1989/Scripts
Some scripts that I have written for tasks on Wikipedia, both to learn how to work with Wikipedia programmatically and to become more familiar with pywikipediabot. Comments, criticism, questions, suggestions etc. are welcome.
Python script to find links incorrectly tagged with disambiguation templates, used in response to this request. It generates a lot of false positives, so the results need manual inspection and editing. One possible improvement would be to test whether the link tagged with {{dn}} has changed since it was tagged, although this would obviously miss instances where the destination page has been changed from a disambiguation page into an article. Depends on pywikipediabot.
#!/usr/bin/python
import re
import wikipedia, catlib, pagegenerators
import webbrowser
def get_disam_links(page):
    """
    Returns a list of linked page titles that have a
    {{Disambiguation Needed}} template from the given page text.
    """
    disam_re = re.compile(r"\{\{Disambiguation Needed(\|date=|\}\})|" +
                          r"\{\{dn(\|date=|\}\})", re.I)
    res = []
    found = disam_re.search(page)
    while found:
        try:
            link_start = page.rindex("[[", 0, found.start())
        except ValueError:
            return []
        link_end = min(page.index("|", link_start),
                       page.index("]]", link_start))
        res.append(page[link_start + 2:link_end])
        found = disam_re.search(page, found.end())
    disam_dep_re = re.compile(
        r"\{\{Disambiguation Needed\|(?!date=)[^|}]*(\|[^|}]*)?(\|date=[^}]*)?\}\}|" +
        r"\{\{dn\|(?!date=)[^|}]*(\|[^|}]*)?(\|date=[^}]*)?\}\}",
        re.I)
    found_dep = disam_dep_re.search(page)
    while found_dep:
        res.append(found_dep.group().strip("{}").split("|")[1])
        found_dep = disam_dep_re.search(page, found_dep.end())
    return res
def find_fulfilled_dn_templates(category_title, start=None):
    """
    Returns a list of wikipedia.Page objects that have {{dn}} templates
    preceded by, or containing, a link that doesn't lead to a disambiguation
    page.
    """
    site = wikipedia.getSite()
    category = catlib.Category(site, category_title)
    catgen = pagegenerators.CategorizedPageGenerator(category, start=start)
    res = []
    try:
        for article in catgen:
            exists = False
            print "\nPAGE", article
            link_titles = get_disam_links(article.get())
            for link in link_titles:
                link_page = wikipedia.Page(site, link)
                print link_page
                while link_page.isRedirectPage():
                    link_page = link_page.getRedirectTarget()
                    print "redirecting", link_page
                if link_page.exists() and not link_page.isDisambig():
                    print "***********true**********"
                    exists = True
                else:
                    print "false"
            if exists:
                res.append(article)
    except:
        import traceback
        traceback.print_exc()
        return res
    return res
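A footer along these lines could be appended to run the script directly. It is only a sketch, not part of the original: the category title is just an example of the tracking category populated by {{dn}}, and printing the titles for manual review is an assumption about how the output would be used.
# Hypothetical footer: run over the {{dn}} tracking category and print the
# affected titles for manual review.
if __name__ == "__main__":
    try:
        pages = find_fulfilled_dn_templates(
            u"Category:Articles with links needing disambiguation")
        for page in pages:
            print page.title().encode("utf-8")
    finally:
        wikipedia.stopme()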
Python script written for this request. Depends on pywikipediabot and the infobox script below.
#!/usr/bin/python
import infobox
import wikipedia
def get_languages():
    """Hackish and fragile, any changes to the page will probably break it"""
    site = wikipedia.getSite()
    langs = wikipedia.Page(site, "Wikipedia:WikiProject Languages/Primary language names in Ethnologue 16 by ISO code").get()
    langs = langs[langs.find("[[", langs.find("==Codes==")):
                  langs.rfind("]]", 0, langs.find("</tt>")) + 2]
    language_list = [lang.strip("[]") for lang in langs.split("\n")]
    return [tuple(lang.split("|")) for lang in language_list]
def check_languages(start=None, end=None):
    res = []
    disams = []
    misc = []
    site = wikipedia.getSite()
    for language in get_languages()[start:end]:
        try:
            lang_page = wikipedia.Page(site, language[0])
            if lang_page.exists():
                while lang_page.isRedirectPage():
                    lang_page = lang_page.getRedirectTarget()
                if lang_page.isDisambig():
                    disams.append(language)
                    # print "disambiguation", language
                    continue
                try:
                    parsed_infobox = infobox.infobox_parse(lang_page)
                except Exception:
                    # print "parse error", language
                    misc.append(language)
                    continue
                params = [parsed_infobox[key] for key in parsed_infobox
                          if key.startswith("lc") or key == "iso3"]
                if all(param != language[1] for param in params):
                    # print "param", language
                    res.append(language)
        except Exception:
            # print "other error", language
            misc.append(language)
    return res, disams, misc
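A similar footer would exercise check_languages. This is only a sketch: the slice bounds are arbitrary and just keep a test run short, and the output format is an assumption.
# Hypothetical footer: check the first 50 entries and report the three
# result lists (mismatches, disambiguation pages, errors).
if __name__ == "__main__":
    try:
        mismatches, disams, errors = check_languages(start=0, end=50)
        print "ISO code missing from infobox:", mismatches
        print "Disambiguation pages:", disams
        print "Parse or other errors:", errors
    finally:
        wikipedia.stopme()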
Python script to extract the first infobox from a page and return a dict of its parameters and their values. Only tested on simple infoboxes; it probably fails on some others. Depends on pywikipediabot.
#!/usr/bin/python
# Adapted from:
# http://mcstrother.wordpress.com/2011/02/22/scraping-and-parsing-wikipedia-infoboxes-in-python/
import re
import sys
import wikipedia
def get_infobox_from_text(article_text):
    # Build a regexp to find the first infobox in the article text
    exp = r'\{\{' # the opening brackets for the infobox
    exp = exp + r'\s*' # any amount of whitespace
    exp = exp + r'[Ii]nfobox +' # the word "infobox", capitalized or not, followed by at least one space
    # if box_title:
    #     exp = exp + box_title # the infobox title, capitalized or not
    #     exp = exp + r'\s*\|' # any number of spaces or returns followed by a pipe character
    exp = exp + r'.*' # a bunch of other stuff in the infobox
    exp3 = exp # save the regexp so far so that I can use it later
    exp3 = exp3 + r'.*\}\}' # any amount of anything, followed by the end of the infobox
    exp3_obj = re.compile(exp3, re.DOTALL)
    search_result = exp3_obj.search(article_text)
    if search_result:
        result_text = search_result.group(0) # the entire matching sequence
    else:
        return None
    # the regex isn't perfect, so look for the closing brackets of the infobox
    # by counting brace depth
    count = 0
    last_ind = None
    for ind, c in enumerate(result_text):
        if c == '}':
            count = count - 1
        elif c == '{':
            count = count + 1
        if count == 0 and not ind == 0:
            last_ind = ind
            break
    return result_text[0:last_ind + 1]
def parse_infobox_text(text):
    text = text.split('|')
    text = text[1:] # everything before the first pipe is the infobox declaration
    new_list = [text[0]]
    for item in text[1:]:
        # make sure we split only on the pipes that represent ends of the
        # infobox entry, not the pipes used in links
        if (']]' in item) and ((not '[[' in item) or item.find(']]') < item.find('[[')):
            new_list[-1] = new_list[-1] + '|' + item
        else:
            new_list.append(item)
    new_list[-1] = new_list[-1][:-2] # trim off the closing brackets
    data_dict = {}
    for item in new_list:
        if '=' in item:
            items = item.split('=', 1)
            data_dict[items[0].strip()] = items[1].strip()
        else:
            continue
    return data_dict
def infobox_parse(article):
    """article: wikipedia.Page object"""
    while article.isRedirectPage():
        article = article.getRedirectTarget()
    article_text = article.get()
    return parse_infobox_text(get_infobox_from_text(article_text))
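Usage might look like the following sketch; the page title is only an illustration, and note that infobox_parse will raise an exception if get_infobox_from_text finds no infobox and returns None.
# Hypothetical example of calling infobox_parse; the page title is illustrative.
if __name__ == "__main__":
    site = wikipedia.getSite()
    page = wikipedia.Page(site, u"English language")
    try:
        for name, value in sorted(infobox_parse(page).items()):
            print name.encode("utf-8"), "=", value.encode("utf-8")
    finally:
        wikipedia.stopme()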
Simpler and probably more robust approach to infobox parsing, using wikipedia.Page.templatesWithParams(). Depends on pywikipediabot.
#!/usr/bin/python
import wikipedia
def parse_infoboxes(page, *template_titles):
    """
    Returns a list of parsed templates that have the given titles, or all
    templates whose titles start with "Infobox" if none are given.
    page: wikipedia.Page object
    """
    res = []
    if template_titles:
        templates = [template for template in page.templatesWithParams()
                     if template[0] in template_titles]
    else:
        templates = [template for template in page.templatesWithParams()
                     if template[0].startswith("Infobox")]
    for template in templates:
        template_dict = {}
        for param in template[1]:
            if "=" in param:
                split_param = param.split("=", 1)
                template_dict[split_param[0].strip()] = split_param[1].strip()
        res.append(template_dict)
    return res
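For example (a sketch only; the page and template titles are illustrative, and templatesWithParams() returns titles as they appear in the wikitext, so the capitalisation has to match):
# Hypothetical example of calling parse_infoboxes; titles are illustrative.
if __name__ == "__main__":
    site = wikipedia.getSite()
    page = wikipedia.Page(site, u"English language")
    try:
        # All templates on the page whose title starts with "Infobox":
        for box in parse_infoboxes(page):
            print sorted(box.keys())
        # Or only a named template:
        print parse_infoboxes(page, u"Infobox language")
    finally:
        wikipedia.stopme()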
chart_references.py
Script for this request.
#!/usr/bin/python
import wikipedia
import bs4
import catlib
def main():
    site = wikipedia.getSite()
    cat = catlib.Category(
        site, "Category:Singlechart making named ref").articles()
    res = []
    for page in cat:
        # print page
        if has_ref_conflict(page):
            # print "found"
            res.append(page)
    return res
def has_ref_conflict(page):
    single_refnames = set()
    for tem in page.templatesWithParams():
        if tem[0].lower() == "singlechart":
            for param in tem[1]:
                if param.startswith("refname"):
                    single_refnames.add(param[param.find("=") + 1:].strip('"'))
                    break
    refnames = set()
    ref_tags = bs4.BeautifulSoup(page.get()).find_all("ref")
    for tag in ref_tags:
        if tag.has_attr("name") and tag.contents and not tag.is_empty_element:
            refnames.add(tag.attrs["name"])
    return refnames & single_refnames
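The module has no entry point of its own, so something like this footer (an assumption, not part of the original) would run it and list the affected pages:
# Hypothetical footer: print the titles of pages where a <ref name=...> with
# content shares its name with a Singlechart refname parameter.
if __name__ == "__main__":
    try:
        for page in main():
            print page.title().encode("utf-8")
    finally:
        wikipedia.stopme()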
merge_template.py
#!/usr/bin/python
import wikipedia
import catlib
def main(sim=True):
    site = wikipedia.getSite()
    wikipedia.simulate = sim
    # wikipedia.verbose = 1
    cat = catlib.Category(
        site, "Category:All articles to be merged").articles()
    res = []
    for page in cat:
        print page
        if page.namespace(): # talk pages are inconsistent, there are only 45
            print "namespace: ", page.title()
            continue
        for tem in page.templatesWithParams():
            if tem[0].lower().startswith("merge"):
                merge_targets = []
                # initialise so the later check doesn't fail when every
                # parameter of the template is positional
                remaining_params = []
                for i, param in enumerate(tem[1]):
                    if "=" not in param:
                        merge_targets.append(wikipedia.Page(site, param))
                    else:
                        remaining_params = [p for p in tem[1][i:]
                                            if (p.lower().startswith("date=")
                                                or p.lower().startswith("discuss="))]
                        break
                break
        else:
            continue # no merge template found
        for target_page in merge_targets:
            if not [target_tem
                    for target_tem in target_page.templatesWithParams()
                    if target_tem[0].lower().startswith("merge")]:
                new_text = u"{{"
                if tem[0].lower() == "merge to":
                    new_text += u"Merge From"
                elif tem[0].lower() == "merge":
                    new_text += u"Merge"
                elif tem[0].lower() == "merge from":
                    new_text += u"Merge to"
                new_text += u"|" + page.title()
                if remaining_params:
                    new_text += u"|" + u"|".join(remaining_params)
                new_text += u"}}\n\n"
                new_text += target_page.get()
                print new_text.encode("utf-8") + "\n\n"
                if raw_input("Edit " + target_page.title().encode("utf-8")
                             + " ?") == "y":
                    target_page.put(new_text, comment=u"Add merge template")
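main() defaults to simulate mode, so nothing is saved unless it is called with sim=False; a footer along these lines (again an assumption, not part of the original, including the "-really" flag name) would make that explicit:
# Hypothetical footer: stay in simulate mode unless "-really" is passed.
if __name__ == "__main__":
    import sys
    try:
        main(sim=("-really" not in sys.argv))
    finally:
        wikipedia.stopme()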