Jump to content

User:Monk of the highest order/ASE/code

From Wikipedia, the free encyclopedia

This is the code I used to find Wikipedia articles which have only ever seen one human editor (usually the page creator). The last time I ran this was two years ago; it produced a list about 2000 entries long, which has since been whittled down to about 100 or so - in other words, all but one hundred have been reviewed. I'll probably run this script again soon, accounting for those articles already reviewed from the first run. When I do that, I'll clean these files up, re-organize them, give them more meaningful filenames, etc.

xmlsplitter.py

[edit]
#XMLsplitter.py
#V03
#Released under GNU GPLv3 by Monk of the Highest Order, 2008.

#Partitions a giant XML document
#into smaller documents without breaking content across
#a selected element. So for example, if the element is
#<artist> all data between that and the </artist> tag is kept in the
#same doc.


import re, random
from utility import *
from sys import exit

#example xml doc:
#<base>
#        <mid1>
#                <mid2>
#                        <pageunit>
#                                change10
#                                change 9
#                        </pageunit>
#                        ...repeat x100000000000....
#                </mid2>
#        </mid1>
#</base>

#basic idea: Strip base, mid1, mid2 (why even worry)
# just make files which just contain distinct page data
# I wouldn't make 1 file for each page. Not sure the
# file system could handle 2mil files. I'd stay safe
# at something like 2k. Named numerically, probably,
# so we don't need to get into title extraction
# so 2.2 mil / 2000 files = 1.1*10^3 pages ea.

def interpret(textline, pagecount, parent_tags, data_to_get):
    """Classify one line of the source XML for the splitter.

    Args:
        textline: one line read from the big XML file.
        pagecount: number of closing ``data_to_get`` tags counted so far.
        parent_tags: regex alternation of wrapper tag names (for example
            ``'mediawiki'``); a line holding nothing but such an opening
            or closing tag is dropped.
        data_to_get: name of the element whose closing tag marks the end
            of one unit (for example ``'page'``).

    Both tag arguments are spliced into the pattern unescaped, so they
    are interpreted as regular expressions. Matching is case-insensitive.

    Returns:
        ``(line, pagecount)``. ``line`` is ``None`` when the line is a
        bare wrapper tag and should be discarded, otherwise it is
        ``textline`` unchanged. ``pagecount`` is incremented when the
        line contains ``</data_to_get>``.
    """
    # Raw strings: '\s' inside a normal string literal is an invalid
    # escape sequence and triggers a warning on current Python versions.
    if re.search(r'(?i)</' + data_to_get + r'>', textline):
        pagecount += 1
        #print('page' + str(pagecount))
    elif re.search(r'(?i)^\s*</?(' + parent_tags + r')>\s*$', textline):
        return None, pagecount
    return textline, pagecount

def get_pages_per_file(rel_position):
    """Return how many pages to put in the next output file.

    Args:
        rel_position: float giving the reader's relative position in the
            big source file, where 0.0 is the beginning and 1.0 the end.

    Returns:
        200, 800, 1200 or 2600, growing with ``rel_position``.

    Prints an error and exits the process when ``rel_position`` is
    greater than 1.0 (or is not comparable, e.g. NaN).
    """
    if rel_position < 0.1:
        return 200
    if rel_position < 0.3:
        return 800
    if rel_position < 0.5:
        return 1200
    # The bound is inclusive: 1.0 is what the caller computes when the
    # reader sits exactly at the end of the file, and that must not be
    # treated as an error.
    if rel_position <= 1.0:
        return 2600
    print("error! no rel position within 0-1.0", repr(rel_position))
    exit()

def main():
    """Split the big XML file into numbered ``output/<n>.block`` files.

    Each block file holds the lines of up to ``pages_per_file`` units
    (as counted by ``interpret``), wrapped in ``<block>``/``</block>``.
    After every block the source name, the current byte offset and the
    next file number are handed to ``pickle_data('xmlsplitter.tmp', ...)``
    and they are read back with ``unpickle_data`` at start-up, so an
    interrupted run can continue.
    NOTE(review): both helpers come from ``utility``; presumably the list
    passed to ``unpickle_data`` is the default used when no saved state
    exists - confirm against that module.
    """
    (sourcexml, pos, filenum) = unpickle_data('xmlsplitter.tmp',
        ['1008smh.xml', 0, 1])
    nfilegoal = 16000.  # stop once this many output files exist
    output_folder = 'output/'
    data_to_get = 'page'
    parent_tags = 'mediawiki'

    fbig = open(sourcexml, 'r')
    try:
        fbig.seek(0, 2)
        eof_loc = fbig.tell()
        if pos >= eof_loc:
            # Nothing left to split (empty file or a finished run); this
            # also avoids dividing by a zero file length below.
            return
        fbig.seek(pos)

        pages_per_file = get_pages_per_file(pos / float(eof_loc))

        while fbig.tell() < eof_loc:
            if filenum >= nfilegoal:
                exit()
            newblock = ['<block>\n']
            pagecount = 0

            offset = fbig.tell()
            while pagecount < pages_per_file and offset < eof_loc:
                prev = offset
                try:
                    newline, pagecount = interpret(fbig.readline(),
                        pagecount, parent_tags, data_to_get)
                except IOError:
                    print("IOError... waiting it out.")
                    fbig.seek(prev + 30)
                    # Nothing was read: do not re-append the line kept
                    # from the previous iteration.
                    newline = None
                offset = fbig.tell()
                if offset > eof_loc:
                    offset = prev + 30
                    fbig.seek(offset)
                if newline:
                    newblock.append(newline)

            newblock.append('</block>')
            print(fbig.tell(), eof_loc)

            with open(output_folder + str(filenum) + '.block', 'w') as fblock:
                fblock.writelines(newblock)

            rel_position = fbig.tell() / float(eof_loc)
            print("File " + str(filenum) + " (" +
                str(int(rel_position * 100)) +
                "%) written.")
            filenum += 1
            # Save progress before sizing the next block, so the state of
            # the final block is recorded even if the sizing call bails.
            pickle_data('xmlsplitter.tmp', [sourcexml, fbig.tell(), filenum])
            if rel_position < 1.0:
                pages_per_file = get_pages_per_file(rel_position)
    finally:
        fbig.close()

if __name__ == '__main__':
    main()

parser.py

[edit]
#The structure of this program is designed not around speed, but around
#memory constraints. It is assumed that you have lotsa space and lotsa time.
#TODO:
#Output file
#Cleanup constants -> (eg, one file should handle the constant locations of
# the bot list, the redirect list, ids-editors db, one-editor folder, etc.
# probably this folder)

import sys
import re
import csv
import optparse
from xml.sax import make_parser, handler
import sqlite3
from glob import glob
try: from urllib.parse import quote
except: from urllib import quote

import utility
import pageparser_db
import wiki_pageset
import one_authorize
from xml_to_pageset import WikiXMLParser


BOT_NAMES_LIST='bot_list.txt'
BOT_IDS_LIST='bot_list_ids.txt'

def get_bots_list(value='names'):
    """Load the list of known bots, URL-quoted and sorted.

    Args:
        value: ``'names'`` to read BOT_NAMES_LIST, ``'ids'`` to read
            BOT_IDS_LIST.

    Returns:
        A sorted list with one quoted entry per non-blank line of the
        file, plus the fixed entry ``'Conversion%20script'``.

    Raises:
        ValueError: if ``value`` is neither ``'names'`` nor ``'ids'``.

    Prints an error and exits the process if the file cannot be read.
    """
    if value == 'names':
        path = BOT_NAMES_LIST
    elif value == 'ids':
        path = BOT_IDS_LIST
    else:
        raise ValueError("value must be 'names' or 'ids', got %r" % (value,))
    try:
        with open(path, 'r') as fbots:
            # Every line is stripped and quoted, including the last one.
            # Blank lines are skipped so that an empty string never ends
            # up in the bot list.
            bots = [quote(line.rstrip()) for line in fbots if line.strip()]
    except IOError:
        print(" error: could not read one of bots list filez")
        sys.exit()
    bots.append('Conversion%20script')
    return sorted(bots)

##### Command System #####

if __name__=='__main__':
        # Command-line driver. Exactly one operation flag must be given,
        # followed by one or more file arguments (glob patterns allowed);
        # the selected operation is then applied to every file in turn.
        command = optparse.OptionParser()
        command.set_usage("""
        Usage: parser.py [-v/-q]
        [-1 1.xml 2.xml 3.xml...]
        [-f 1.xml.csv 2.xml.csv...]
        [-2 1.xml.csv 2.xml.csv...]
        [-3 1.xml.csv 2.xml.csv...]
        [-4 1.xml.csv 2.xml.csv...]
        [-5 1.xml.inx.csv 2.xml.inx.csv...]
        """)
        command.add_option("-1", "--xml_decode",
                action="store_true",
                dest="xml_decode",
                help="XML -> CSV 'pageset' of pagename, pageid, editorid, and edits by editor id")
        command.add_option("-f", "--filter_csv",
                action="store_true",
                dest="filter_csv",
                help="refilter a csv file for bots, userpages, etc...")
        command.add_option("-2", "--fill-editor-db",
                action="store_true",
                dest="fill_editor_db",
                help="add CSV pageset data to: sqlite db of edit count per page by each user.")
        command.add_option("-t", "--tally-editor-db",
                action="store_true",
                dest="tally",
                help="run (2) on every pageset available, then run this, before using option (4)")
        command.add_option("-3", "--one-editor",
                action="store_true",
                dest="one_editor",
                help="CSV pageset -> new CSV with one-editor pages only")
        command.add_option("-4", "--inexp-editor",
                action="store_true",
                dest="inexp_editor",
                help="""CSV pageset -> new CSV with with one author only,
                    with that author having less than 15 edits to his name
                    (completely fill the SQLITE database b4 using this option).""")
        command.add_option("-5", "--title-list",
                action="store_true",
                dest="title_list",
                help="CSV pageset -> list of pages within by title")
        command.add_option("-i", "--id-list",
                action="store_true",
                dest="id_list",
                help="CSV pageset -> list of pages within by id")
        command.add_option("--gt_ids",
                action="store",
                dest="gt_ids",
                help="necessary for -4: list of the userids whose editcounts qualify them as experienced")
        command.add_option("--gt_ips",
                action="store",
                dest="gt_ips",
                help="necessary for -4: list of the ips whose editcounts qualify them as experienced")
        #command.add_option("-X", "--mult-editors",
        #        action="store_true",
        #        dest="make_list",
        #        help="CSV pageset -> new CSV of pages with more than one editor.")
        command.add_option("-v", "--verbose",
                action="store_true",
                dest="output_verbose",
                help="option: give lots of debug output")
        command.add_option("-q", "--quiet",
                action="store_true",
                dest="output_quiet",
                help="option: No command line output")
        (options, args) = command.parse_args(sys.argv[1:])
        
        # verbosity: 0 = quiet, 1 = normal, 2 = debug
        if options.output_quiet:
            verbose=0
        elif options.output_verbose:
            verbose=2
        else:
            verbose=1
        
        #testing for usability of command line options...
        # Count the operation flags that were set (everything except the
        # verbosity switches and the two --gt_* value options): more than
        # one, or none at all, prints the usage text and exits.
        operations=options.__dict__
        j=0
        for i in operations:
            if i not in ['output_quiet', 'output_verbose', 'gt_ids', 'gt_ips'] and \
                operations[i]:
                if verbose: print(str(i))
                j+=1
                if j==2:
                    print(str(i))
                    command.print_usage()
                    sys.exit()
        if j==0:
                command.print_usage()
                sys.exit()
        
        if True:
            #if we're using an option which only uses file(s) as the argument(s)
            if not args:
                print(' error:this operation requires at least one file argument')
                sys.exit()
            elif [] in [glob(x) for x in args]:
                print(' error:this operation requires all arguments to be files.')
                sys.exit()
            args=utility.glob_list(args)
 
###### operations ######
        if options.xml_decode:
            parser = make_parser()
            parser.setContentHandler(WikiXMLParser(verbose=verbose))
            cleaner = wiki_pageset.PageFilter(verbose=verbose,
             bot_ids=get_bots_list('ids'), bot_names=get_bots_list('names'))
            
            for arg in args:
                if verbose: print(" opening file",arg)
                parser.parse(arg)
                pages=cleaner.clean(parser.getContentHandler().pages, 
                    rm_bot_revisions=True,
                    rm_user_talk=True,
                    rm_redirects=True,
                    associate_to=False,
                    associate_from=True,
                    rm_usernames=True)
                if verbose: print(" done.")
                # csv_store_pageset lives in wiki_pageset; the bare name
                # is not defined in this module.
                wiki_pageset.csv_store_pageset(arg+'.csv', pages)
        elif options.filter_csv:
            cleaner = wiki_pageset.PageFilter(verbose=verbose,
             bot_ids=get_bots_list('ids'), bot_names=get_bots_list('names'))
            for arg in args:
                if verbose: print(" opening file",arg)
                pageset=wiki_pageset.csv_load_pageset(arg)
                pageset=cleaner.clean(pageset, 
                    rm_bot_revisions=False,
                    rm_user_talk=True,
                    rm_redirects=False,
                    associate_to=False,
                    associate_from=False,
                    rm_usernames=False)
                if verbose: print(" done.")
                wiki_pageset.csv_store_pageset(arg[:-4] + '.f.csv', pageset)
        elif options.fill_editor_db:
            editor_db = one_authorize.EditsByUser(verbose=verbose)
            for arg in args:
                if verbose: print(" opening file",arg)
                pageset=wiki_pageset.csv_load_pageset(arg)
                userids, ip_addrs=editor_db.get_edits_by_user(pageset)
                utility.csv_write(arg[:-4]+'.editors_ids.csv', userids)
                utility.csv_write(arg[:-4]+'.editors_ips.csv', ip_addrs)
        elif options.one_editor:
            for arg in args:
                if verbose: print(" opening file",arg)
                pageset=wiki_pageset.csv_load_pageset(arg)
                pageset2=[]
                for page in pageset:
                    editors=set()
                    if verbose==2: print("  going thru pageset")
                    for revision in page.revisions:
                        editors.add(revision["contributorID"])
                        if len(editors)>1:
                            break
                    else:
                        # loop finished without finding a second editor
                        pageset2.append(page)
                wiki_pageset.csv_store_pageset(arg[:-4]+'.one_edtr', pageset2)
                if verbose: print(" done")
        elif options.inexp_editor:
            for arg in args:
                if verbose: print("opening file", arg)
                pageset_listform=utility.csv_read(arg)
                if not options.gt_ips or not options.gt_ids:
                    print("""ERROR. you need to provide a list of
                      'experienced users' for this operation... both
                      by ip and userid. see --help""")
                    sys.exit()
                editor_db = one_authorize.EditsByUser(verbose=verbose)
                pageset2 =editor_db.get_inx_pages(pageset_listform,
                    ips_gt=options.gt_ips,
                    ids_gt=options.gt_ids)
                utility.csv_write(arg[:-4]+'.inx_edtr', pageset2)
                if verbose: print(" done")
        elif options.id_list or options.title_list:
            # The option's dest is "id_list"; csv_load_pageset returns
            # PageHistory objects, so the wanted column is read as an
            # attribute (title = CSV column 0, idnum = CSV column 1).
            if options.id_list:
                ext='.pageids'
                attr='idnum'
            else:
                ext='.titles'
                attr='title'
            for arg in args:
                if verbose: print(" opening file",arg)
                with open(arg,'r') as f_arg, open(arg+ext,'w') as f_output:
                    f_arg.seek(0,2)
                    eof_loc=f_arg.tell()
                    f_arg.seek(0)
                    while f_arg.tell() < eof_loc:
                        # work through the file 800 lines at a time to
                        # keep memory use bounded
                        line_buffer=[]
                        for i in range(800):
                            line=f_arg.readline()
                            if not line:
                                break #reached the end of the file
                            line_buffer.append(line)
                        if verbose: print("  progress:", float(100*f_arg.tell())/eof_loc)
                        splitted=wiki_pageset.csv_load_pageset(line_buffer, isfile=False)
                        page_attr_list=[getattr(x, attr) + '\n' for x in splitted]
                        f_output.writelines(page_attr_list)
                        f_output.flush()
                        del splitted
                        del page_attr_list
                if verbose: print(" done.",arg)
        elif options.tally:
            editor_db = one_authorize.EditsByUser(verbose=verbose)
            if verbose: print(" start")
            editor_db.fill_edit_db(input_files=args, editcount_folder='/opt/editcounts/')
            if verbose: print(" done")

pageparser_db.py

[edit]

much of this is obsolete and no longer used... sqlite is rather no good for some high load things, I feel. :* just kidding, I'm just no good at sqlite optimization

import sqlite3,sys

#5555555555555555
# DB operations 5
#5555555555555555

# Each descriptor below holds the three values connect_base() takes:
# the sqlite file name, the statement used to create the schema in an
# empty database, and the list of expected table names.

# contributorID -> username pairs.
# Contributor ids are stored as text on purpose: several of the early ids
# were ASCII strings that cannot be converted to ints.
ID_TO_NAME = {
    'filename': 'ids_to_names.sqlite',
    'creation_schema': "CREATE TABLE contributors(contributorID text PRIMARY KEY,username text)",
    'table_list': ['contributors'],
}

# Edit totals per contributor.
# NOTE(review): the original remark here warned that user ids must not be
# strongly typed as ints (in sqlite only INTEGER is strongly typed), yet
# this schema declares contributorID as INTEGER PRIMARY KEY - confirm
# which of the two is intended.
EDITCOUNT = {
    'filename': 'editcount.sqlite',
    'creation_schema': "CREATE TABLE total_edits(contributorID INTEGER PRIMARY KEY,editcount INTEGER)",
    'table_list': ['total_edits'],
}


# Page ids of redirect pages.
REDIRECTS = {
    'filename': 'redirects.sqlite',
    'creation_schema': "CREATE TABLE redirects(idnum INTEGER NOT NULL UNIQUE)",
    'table_list': ['redirects'],
}


def connect_base(filename, creation_schema, table_list):
    """Open a sqlite database, creating its schema if it is empty.

    Args:
        filename: path of the sqlite file.
        creation_schema: SQL statement executed (and committed) when the
            database contains no tables at all.
        table_list: expected table names; only the first one is checked.

    Returns:
        ``(connection, cursor)``.

    Prints a message and exits the process when the database already has
    tables but ``table_list[0]`` is not among them.
    """
    connection = sqlite3.connect(filename)
    cursor = connection.cursor()
    cursor.execute("select tbl_name from sqlite_master where type='table' order by tbl_name")
    existing = [value for record in cursor.fetchall() for value in record]
    if not existing:
        # brand-new database: build the schema
        cursor.execute(creation_schema)
        connection.commit()
        return connection, cursor
    if table_list[0] in existing:
        return connection, cursor
    print(filename, " db has unknown schema. please fix manually.")
    sys.exit()

def connect_contributor_id_base():
    """Open the contributorID -> username database described by ID_TO_NAME.

    Returns whatever connect_base returns for that descriptor.
    """
    spec = ID_TO_NAME
    return connect_base(spec['filename'], spec['creation_schema'],
        spec['table_list'])

def connect_editcount_base(basemodulo, editcount_folder='/opt/editcounts/'):
    """Open one of the edit-count databases described by EDITCOUNT.

    Args:
        basemodulo: value prefixed (via ``str``) to the EDITCOUNT file
            name to pick the database file.
        editcount_folder: directory holding the edit-count databases.
            Defaults to the location that used to be hard-coded here.

    Returns whatever connect_base returns for that file.
    """
    import os  # local import: os is not among this module's imports
    db_path = os.path.join(editcount_folder,
        str(basemodulo) + EDITCOUNT['filename'])
    return connect_base(db_path,
        EDITCOUNT['creation_schema'], EDITCOUNT['table_list'])

def connect_redirect_base():
    """Open the redirect database described by REDIRECTS.

    Returns whatever connect_base returns for that descriptor.
    """
    spec = REDIRECTS
    return connect_base(spec['filename'], spec['creation_schema'],
        spec['table_list'])
    
# <<<<<<<<<<<<>>>>>>>>>>>>
# < ID_to_Name functions >
# <<<<<<<<<<<<>>>>>>>>>>>>

def associate(contributorID, username):
    """Store a contributorID -> username pair in the id database.

    Args:
        contributorID: key of the new row.
        username: name to associate with it.

    Returns:
        The object returned by the cursor's ``execute`` call, or ``None``
        when the insert raises ``sqlite3.IntegrityError``
        (contributorID is the table's primary key).
    """
    base, cu = connect_contributor_id_base()
    try:
        results = cu.execute('INSERT INTO contributors(contributorID,username) values (?,?)', (contributorID,username))
        base.commit()
        return results
    except sqlite3.IntegrityError:
        return None
    finally:
        # close on every path, including the IntegrityError one
        base.close()

def get_username(contributorID):
    """Look up the username(s) stored for a contributor id.

    Returns a flat list of the ``username`` values of all matching rows;
    the list is empty when the id is not in the database.
    """
    connection, cursor = connect_contributor_id_base()
    cursor.execute('SELECT username FROM contributors WHERE contributorID=?',(contributorID,))
    names = [value for record in cursor.fetchall() for value in record]
    connection.close()
    return names

wiki_pageset.py

[edit]

for understanding and filtering sets of page history for bots, redirects, etc. parser.py is used to load and call the classes and functions in here, usually.

import utility, pageparser_db, sqlite3
try: from urllib.parse import quote
except: from urllib import quote
from time import time #for benchmarking purposes

class PageHistory():
    """Container for one page: its title, its id and its revisions.

    A new instance starts with no title, no id and an empty revision
    list of its own.
    """

    def __init__(self):
        self.title = self.idnum = None
        self.revisions = list()

def csv_store_pageset(filename, cleaned_pageset):
    '''Write a pageset (a list [] of PageHistory objects) to a CSV file.

    Each page becomes one row built by utility.flatten_list from the
    page title, the page id and the contributorID of every revision.
    WARNING: all other revision data (usernames, comments) is dropped.
    '.csv' is appended to filename unless the text after its last '.'
    is already 'csv'. Always returns True.
    '''
    rows = []
    for page in cleaned_pageset:
        contributor_ids = [revision['contributorID']
                           for revision in page.revisions]
        rows.append(utility.flatten_list(
            [page.title, page.idnum, contributor_ids]))
    extension = filename.rsplit('.', 1)[-1]
    if extension != 'csv':
        filename = filename + '.csv'
    utility.csv_write(filename, rows)
    return True

def csv_load_pageset(filename, isfile=True):
    """Rebuild a pageset from the rows returned by utility.csv_read.

    Both arguments are passed straight to utility.csv_read. In each row,
    column 0 is taken as the page title, column 1 as the page id and
    every further column as the contributorID of one revision; the
    'username' and 'comment' of each rebuilt revision are empty strings.

    Returns a list of PageHistory objects, one per row.
    """
    loaded = []
    for row in utility.csv_read(filename, isfile):
        history = PageHistory()
        history.title = row[0]
        history.idnum = row[1]
        history.revisions = [
            {'contributorID': contributor, 'username': '', 'comment': ''}
            for contributor in row[2:]
        ]
        loaded.append(history)
    return loaded

######################
# Massive pageset filterer
######################

class PageFilter():
    """Filters pagesets: drops redirects, non-article pages and bot edits.

    Configuration (bot lists, redirect table, id database handles) is
    stored on the class itself, so it is shared by all instances.
    """

    # Maps the first two characters of a title to the redirect titles
    # starting with them. Nothing in this class fills it (the loading
    # code was disabled), so redirect removal is a no-op until a caller
    # assigns PageFilter.redirect_complex.
    redirect_complex = {}

    def __init__(self, verbose=0,bot_names=None,bot_ids=None):
        """Store the bot lists and connect to the id -> username database.

        Args:
            verbose: 0 = silent, 1 = normal, 2 = debug output and timing.
            bot_names: usernames whose revisions count as bot edits.
            bot_ids: contributor ids whose revisions count as bot edits.
        """
        self.verbose=verbose
        if self.verbose: print(" loading data to clean pagesets")

        PageFilter.bot_ids=[] if bot_ids is None else bot_ids
        PageFilter.bot_names=[] if bot_names is None else bot_names
        if self.verbose==2: print("  Connecting to sqlite database of userid-username pairs.")
        #sqlite database with a single table with userid as primary key and username as the other value
        PageFilter.id_base, PageFilter.id_cu= \
         pageparser_db.connect_contributor_id_base()

    def clean(self, pageset, rm_bot_revisions=True, rm_user_talk=True,
        rm_redirects=True, associate_to=False, associate_from=False,
        rm_usernames=True):
        """Filter ``pageset`` in place and return it.

        Args:
            pageset: list of page objects with ``title`` and ``revisions``
                (a list of dicts keyed by 'contributorID', 'username',
                'comment').
            rm_bot_revisions: drop revisions whose username or id is in
                the bot lists, whose username has 'bot' in its last four
                characters, or whose comment contains 'bot'.
            rm_user_talk: drop pages whose title starts with a talk, help,
                wikipedia, user, image, file, category, template or
                portal prefix.
            rm_redirects: drop pages listed in ``redirect_complex``.
            associate_to: fill in each revision's username from the id
                database.
            associate_from: otherwise, insert each revision's id/username
                pair into the id database (existing ids are left alone).
            rm_usernames: reduce every kept revision to its contributorID.

        The id database is committed before returning.
        """
        import re  # local import: this module's import block lacks re

        verbose=self.verbose
        debug=(verbose==2)
        non_article=re.compile(r'(?i)^(talk|help((\s|\%20)talk)?|wikipedia((\s|\%20)talk)?|user((\s|\%20)talk)?|image((\s|\%20)talk)?|file((\s|\%20)talk)?|category((\s|\%20)talk)?|template((\s|\%20)talk)?|portal((\s|\%20)talk)?)(:|\%3A)')
        # sets make the per-revision membership tests cheap
        bot_names=set(PageFilter.bot_names)
        bot_ids=set(PageFilter.bot_ids)

        if debug:
            timer={"redirects":0,"user_talk":0,
                "associate to/from":0, "revisions":0, "bot_revisions":0,
                "bots2":0, "rm_usernames":0, "rm_unnec_revisions":0,
                "rm_unnec_pages":0,"commit":0}
            eop=len(pageset)
            prev='0'

        unnec_pages = []
        if associate_from: PageFilter.id_cu.execute('BEGIN;')
        for pagenum, page in enumerate(pageset):
            if debug:
                tmptime=time()
                cur=str(int((pagenum/float(eop))*100))
                if len(cur)>1 and cur[0] != prev[0]: print(cur)
                prev=cur
            flagged=False
            if rm_redirects:
                title=page.title
                if title[:2] in PageFilter.redirect_complex and \
                  title in PageFilter.redirect_complex[title[:2]]:
                    if debug: timer['rm_unnec_pages']+=1
                    if verbose==3: print('found redirect', title)
                    unnec_pages.append(pagenum)
                    flagged=True
            if debug:
                timer['redirects']+=(time()-tmptime)
                tmptime=time()
            if rm_user_talk and non_article.search(page.title):
                # Queue each page only once: a duplicate index would make
                # the deletion pass below remove the following page too.
                if not flagged:
                    unnec_pages.append(pagenum)
                continue
            if debug: timer['user_talk']+=(time()-tmptime)
            unnec_revisions=[]
            for revision_num, revision in enumerate(page.revisions):
                if debug: tmptime=time()
                if associate_to:
                    PageFilter.id_cu.execute('SELECT username FROM contributors WHERE contributorID=?',(revision['contributorID'],))
                    name=PageFilter.id_cu.fetchone()
                    if name:
                        revision['username'] = name[0]
                elif associate_from and revision.get('username'): #associate from pageset into base
                    try:
                        PageFilter.id_cu.execute('INSERT INTO ' + \
                        'contributors(contributorID,username)' + \
                        'values (?,?)', (revision['contributorID'],
                        str(revision['username'])))
                    except sqlite3.IntegrityError:
                        pass #id already known: keep the stored name
                if debug: timer['associate to/from']+=(time()-tmptime)
                if rm_bot_revisions:
                    if debug: tmptime=time()
                    # revisions already stripped by rm_usernames carry no
                    # username/comment; treat those as empty
                    username=revision.get('username', '')
                    comment=revision.get('comment', '')
                    if username in bot_names or \
                       revision['contributorID'] in bot_ids or \
                       'bot' in username[-4:].lower() or \
                       'bot' in comment.lower():
                        unnec_revisions.append(revision_num)
                        if debug: timer['bot_revisions']+=1
                    if debug:
                        timer['revisions']+=1
                        timer['bots2']+=(time()-tmptime)
                if debug: tmptime=time()
                if rm_usernames:
                    page.revisions[revision_num] = {'contributorID':revision['contributorID']} #this must occur AFTER botcheck.
                if debug: timer['rm_usernames']+=(time()-tmptime)
            #items must be removed in reverse order
            #or a removal will shift the index numbers of all later list items
            for entry_num in reversed(unnec_revisions):
                del page.revisions[entry_num]
        if debug: timer['commit']=len(pageset)
        for entry_num in reversed(unnec_pages): #reverse order, as above
            del pageset[entry_num]
        if debug: print("   committing id base")
        PageFilter.id_base.commit()
        if debug:
            print("  done cleaning.")
            for i in timer:
                print("    ", i, "  |  ", str(timer[i])[:5])
        return pageset

    @staticmethod
    def only_one_contributor(pageset):
        """Return the pages of ``pageset`` whose revisions all share one contributorID."""
        one_author_pageset=[]
        for pagehistory in pageset:
            authors=set(x['contributorID'] for x in pagehistory.revisions)
            if len(authors)==1:
                one_author_pageset.append(pagehistory)
        return one_author_pageset

xml_to_pageset.py

[edit]

The core function of making use of all that xml. parser.py is used to load and call the classes and functions in here, usually.

from xml.sax import make_parser, handler
try: from urllib.parse import quote
except: from urllib import quote
import wiki_pageset

class WikiXMLParser(handler.ContentHandler):
    """SAX handler that turns the XML data into a list of page histories.

    Everything is stripped except page titles, page ids and a list of
    revisions for each page. Each revision keeps only the contributor
    (name plus id or IP) and the comment, which is enough to filter out
    bots later; not even dates are kept. Titles, contributor values and
    comments are stored URL-quoted. The result is available as
    ``self.pages`` once the document has been parsed.
    """

    # (element name, current state) -> state entered when that element
    # opens. The state names double as the revision dict keys that
    # characters() writes to.
    important_tags = {
            ('contributor','revision'):'contributor',
            ('username','contributor'):'username',
            ('comment','revision'):'comment',
            ('revision','page'):'revision',
            ('id','page'):'pageID',
            ('id','contributor'):'contributorID',
            ('ip','contributor'):'contributorID',
            ('title','page'):'pagetitle'
        }
    # (element name, state it opened) -> state to return to when the
    # element closes.
    important_tags_reverse = {}
    for tag in important_tags:
        important_tags_reverse[(tag[0],important_tags[tag])]=tag[1]

    def __init__(self, verbose=0):
        handler.ContentHandler.__init__(self)
        self.verbose=verbose

    def set_filename(self, filename): self.filename=filename

    def startDocument(self):
        """Reset the counters, the page list and the parser state."""
        self._elems = 0
        self._attrs = 0
        self.pages = []
        self.parent = 'page'
        self.current = None
        if self.verbose: print('  reading XML...')

    def startElement(self, name, attrs):
        """Open a new page or revision, or follow a tracked state change."""
        self._elems = self._elems + 1
        #self._attrs = self._attrs + len(attrs)
        if name == 'page':
            self.current = wiki_pageset.PageHistory()
            self.parent='page'
        elif name == 'revision':
            self.current.revisions.append({'contributorID':'', 'username':'', 'comment':''})
            self.parent = 'revision'
        elif (name,self.parent) in self.important_tags:
            self.parent = self.important_tags[(name,self.parent)]

    def endElement(self, name):
        """Finish the current page, or step back to the enclosing state."""
        if name == 'page':
            self.pages.append(self.current)
            self.current = None
        elif (name,self.parent) in self.important_tags_reverse:
            self.parent=self.important_tags_reverse[(name,self.parent)]

    def characters(self, content):
        """Collect text for the field selected by the current state.

        A SAX parser may deliver the text of one element in several
        chunks, so every chunk is appended to what is already stored
        instead of replacing it.
        """
        if self.parent == 'pagetitle':
            self.current.title = (self.current.title or '') + quote(content)
        elif self.parent == 'pageID':
            self.current.idnum = (self.current.idnum or '') + content
        elif self.parent in ('contributorID', 'username', 'comment'):
            self.current.revisions[-1][self.parent] += quote(content)

    def endDocument(self):
        if self.verbose: print("   cool stats: ", self._elems, "elements.")
        #if self.verbose: print("   There were", self._attrs, "attributes.")
        return self.pages


one_authorize.py

[edit]

All-in-one for creating a tally of how many edits each author has made (on the assumption of a complete and non-redundant set of csv pagesets) and for removing pages from a pageset based on user editcounts. parser.py is used to load and call the classes and functions in here, usually.

from wiki_pageset import PageHistory
from math import ceil, floor
from time import time
import re,operator,os,sys

import utility
        
    

class EditsByUser():
    """Tally edits per editor and screen pages by how experienced their editor is.

    Editors are identified either by a numeric user id or, for anonymous
    edits, by a dotted-quad IPv4 address which is converted to a base-10
    integer.  Typical flow:

    * get_edits_by_user() turns one pageset into sorted [editor, edits] lists;
    * fill_edit_db() merges many "editor,edits" text files into 100
      per-range block files;
    * get_inx_pages() / has_less_edits_than() filter against the lists of
      "experienced" editors loaded by activate_gt().
    """

    # A user id is a string made only of digits (as opposed to an IP).
    _ID_REGEX = re.compile(r'^\d+$')

    def __init__(self, verbose=0):
        """Create a tallier.

        verbose: 0 is quiet, any truthy value prints progress, and 2
            additionally prints timing diagnostics in fill_edit_db().
        """
        self.verbose=verbose
        self.interval_dicts_done=0
        # Lists of experienced editors; filled lazily by activate_gt().
        self.gt=None
        self.ip_regex=re.compile(r'^(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})$')

    def ip_to_int(self, valuelist):
        """Convert four octets (strings or ints) into one base-10 integer."""
        return (int(valuelist[0])*256**3
                + int(valuelist[1])*256**2
                + int(valuelist[2])*256
                + int(valuelist[3]))

    def int_to_ipstr(self, number):
        """Convert a base-10 integer back into a dotted-quad string.

        Integer (floor) division is required here: true division would
        produce float octets such as '1.0078...' on Python 3.
        """
        octets=[(number // 256**power) % 256 for power in (3, 2, 1, 0)]
        return '.'.join(str(octet) for octet in octets)

    def get_edits_by_user(self, pageset):
        """Count revisions per editor across a pageset.

        pageset: iterable of objects with a `revisions` list of dicts,
            each holding a 'contributorID' string (dotted-quad IP or
            digits-only user id; anything else is ignored).

        Returns (id_list, ip_list): two lists of [editor, edit_count]
        pairs sorted by editor, where IPs are given as base-10 integers.
        """
        if self.verbose: print("  organizing editor data for storage")
        ip_counts={}
        id_counts={}
        for page in pageset:
            for revision in page.revisions:
                userid=revision['contributorID']
                ip_match=self.ip_regex.match(userid)
                if ip_match:
                    key=self.ip_to_int(ip_match.groups())
                    ip_counts[key]=ip_counts.get(key,0)+1
                elif self._ID_REGEX.search(userid):
                    key=int(userid)
                    id_counts[key]=id_counts.get(key,0)+1
        id_list=[[x,id_counts[x]] for x in sorted(id_counts)]
        ip_list=[[x,ip_counts[x]] for x in sorted(ip_counts)]
        return id_list,ip_list

    def interval_dicts(self):
        """Build the id and ip range tables used by fill_edit_db().

        Each table gets its bounds, a file-name tag ('ext'), the width of
        one block ('interval' = ceil((upper-lower)/100)), the 100 block
        start values ('user_blocks', multiples of the interval starting
        at 0) and an empty 'input_files' map of filename -> read offset.
        """
        self.id_dict = {
            # User ids peaked around 8 million when this was written; ten
            # million leaves some headroom.
            'upper':10000000,
            'lower':1,
            'ext':'ids'
        }
        self.ip_dict = {
            'upper':4294967296,
            'lower':16777216,
            'ext':'ips'
        }
        self.user_dicts={'ip':self.ip_dict,'id':self.id_dict}
        for user_dict in (self.ip_dict, self.id_dict):
            interval=ceil(float(user_dict['upper']-user_dict['lower'])/100)
            user_dict['interval']=interval
            user_dict['user_blocks']=[n*interval for n in range(100)]
            # IPs and ids are kept in separate files on purpose: as the
            # number of user ids grows, the two integer ranges would
            # eventually collide if they shared one file.
            user_dict['input_files']={}

    def fill_edit_db(self, input_files=None, editcount_folder='/opt/editcounts/'):
        """Merge per-pageset edit counts into per-range block files.

        input_files: list of existing files, each holding "editor,edits"
            rows sorted by editor, where editor is a user id or a base-10
            IP.  A file is classified by whether its name contains 'ids'
            or 'ips'.
        editcount_folder: folder (with trailing separator) that receives
            the 'edits.<ext>.<block>.txt' totals and the
            'safety_valve_progress.*' offset files.

        For every block (range of editor numbers) each input file is read
        from the offset where the previous block stopped, its counts are
        added to any totals already on disk for that block, and the block
        file is rewritten.
        """
        if input_files is None:
            input_files=[]
        if self.interval_dicts_done==0:
            # NOTE(review): nothing sets interval_dicts_done, so the range
            # tables (and the offsets stored in them) are rebuilt per call.
            self.interval_dicts()
        if self.verbose: print('  Categorizing input editcount files')
        for filename in input_files:
            boxed=False
            for user_dict in [self.ip_dict, self.id_dict]:
                if user_dict['ext'] in filename:
                    boxed=True
                    user_dict['input_files'][filename]=0
            if not boxed:
                print("Error! The filename", filename, " is not clearly distinguishable as either an ip or userid editcount file.")
        # NOTE(review): only the id table is merged here; ip files are
        # classified above but never processed -- confirm this is intended.
        for user_dict in [self.id_dict]:
            if len(user_dict['input_files'])==0:
                if self.verbose: print('  Beginning editcount set ' + user_dict['ext'])
                if self.verbose: print('  Found no files which contained editcounts by ' + user_dict['ext'])
                continue
            for block_num,block in enumerate(user_dict['user_blocks']): #ranges of possible user ids or ips
                if self.verbose: print('   starting new block', block_num, 'out of 100 blocks...')
                loc_block=editcount_folder+'edits.'+user_dict['ext']+'.'+str(block)+'.txt'
                block_data={}
                if os.path.isfile(loc_block):
                    # Totals from an earlier run are loaded and added to.
                    if self.verbose: print('   loading old block data', loc_block)
                    unformatted=[[int(x),int(y)] for x,y in utility.csv_read(loc_block)]
                    block_data=dict(unformatted)
                files_seen=0
                timer={'open/seek':0,'tell':0,'readline':0,
                    'interpret':0,'compare':0, 'incl':0}
                for filename in sorted(user_dict['input_files']):
                    files_seen+=1
                    if files_seen%100==0 and self.verbose==2:
                        print(os.path.basename(filename))
                        for item in timer:
                            print("   ", item, "  |  ", str(timer[item]))
                    tmptime=time()
                    with open(filename,'r') as f_source:
                        f_source.seek(user_dict['input_files'][filename])
                        timer['open/seek']+=(time()-tmptime)
                        while True:
                            # Remember where this chunk starts: if it holds
                            # the first row of a later block, that block
                            # resumes reading from here.
                            tmptime=time()
                            user_dict['input_files'][filename]=f_source.tell()
                            timer['tell']+=(time()-tmptime)
                            tmptime=time()
                            data=f_source.read(2000)
                            row_block=data.split('\n')
                            if len(row_block)==1:
                                # No newline at all: end of file.
                                break
                            if row_block[-1] != '':
                                # The chunk ended mid-row; finish that row.
                                rest=f_source.readline()
                                if rest.endswith('\n'):
                                    row_block[-1]+=rest[:-1]
                                    row_block.append('')
                                else:
                                    row_block[-1]+=rest
                            timer['readline']+=(time()-tmptime)
                            breaker=False
                            for row in row_block:
                                tmptime=time()
                                if row=='':
                                    break
                                user,edits=[int(x) for x in row.split(',')]
                                timer['interpret']+=(time()-tmptime)
                                tmptime=time()
                                if user >= block:
                                    if user >= block+user_dict['interval']:
                                        # Rows are sorted, so everything
                                        # from here on is for later blocks.
                                        breaker=True
                                        break
                                    timer['compare']+=(time()-tmptime)
                                    tmptime=time()
                                    block_data[user]=block_data.get(user,0)+edits
                                    timer['incl']+=(time()-tmptime)
                            if breaker: break
                writable = sorted(block_data.items(),key=operator.itemgetter(0))
                with open(loc_block, 'w') as f_block:
                    f_block.writelines(
                        str(user)+','+str(edits)+'\n' for user,edits in writable)
                safety_valve_progress=editcount_folder+\
                 'safety_valve_progress.'+ user_dict['ext'] + str(block) + '.txt'
                # Record (filename, offset) rows.  Passing the dict itself
                # would iterate only the filenames and write each one out
                # character by character.
                utility.csv_write(safety_valve_progress,
                 sorted(user_dict['input_files'].items(),key=operator.itemgetter(0)))

    def activate_gt(self, ips_gt, ids_gt):
        """Load the lists of experienced editors once and cache them in self.gt.

        ips_gt / ids_gt: paths of text files with one base-10 IP or one
            user id per line; blank lines are skipped.  Both are ignored
            when the lists are already loaded.

        Returns True.  Raises whatever open() or int() raise for a
        missing file or a malformed line.
        """
        if getattr(self, 'gt', None):
            return True
        loaded={}
        for setname,path in (('ip',ips_gt),('id',ids_gt)):
            with open(path,'r') as f_gt:
                loaded[setname]=[int(line) for line in f_gt if line.strip()]
        self.gt=loaded
        return True

    def get_inx_pages(self, pagelist,
        limit=50, ips_gt=None, ids_gt=None):
        """Return the pages whose editor is NOT in the experienced lists.

        pagelist: list of lists whose last element is a string holding
            either a user id or a dotted-quad IP.  That last element is
            replaced in place by its integer form; entries whose last
            element is neither are dropped.
        ips_gt: path of the experienced-IP list (see activate_gt).
        ids_gt: path of the experienced-user-id list (see activate_gt).
        limit: not implemented yet; meant to automate building the
            experienced lists from the tallied edit counts.

        Returns the matching page lists, IP-edited pages first, each
        group sorted by editor number.
        """
        buckets={'ip':[],'id':[]}
        for page in pagelist:
            userid=page[-1]
            ip_match=self.ip_regex.match(userid)
            if ip_match:
                page[-1]=self.ip_to_int(ip_match.groups())
                buckets['ip'].append(page)
            elif self._ID_REGEX.search(userid):
                page[-1]=int(userid)
                buckets['id'].append(page)
        results=[]
        for setname in ['ip','id']:
            ordered=sorted(buckets[setname],key=operator.itemgetter(-1))
            verdicts=self.has_less_edits_than(setname=setname,
                usernames=[page[-1] for page in ordered],
                ips_gt=ips_gt,ids_gt=ids_gt)
            results.extend(page for page,inexperienced
                in zip(ordered,verdicts) if inexperienced)
        return results

    def has_less_edits_than(self, setname='ip',
        usernames=None, ips_gt=None, ids_gt=None):
        """Tell, for each editor, whether they are absent from the experienced list.

        setname: 'ip' or 'id', selecting which experienced list to use.
        usernames: editor numbers (ints) to test.
        ips_gt / ids_gt: paths handed to activate_gt() on first use.

        Returns a list of bools aligned one-to-one with `usernames`:
        True when the editor is not in the experienced list.  Editors
        smaller or larger than every listed one, and an empty experienced
        list, are handled like any other case.
        """
        self.activate_gt(ips_gt,ids_gt)
        experienced=set(self.gt[setname])
        return [user not in experienced for user in (usernames or [])]

utility.py

[edit]

I know, I know, more descriptive names, I'll give it one. This is just a set of toolbox functions I typically carry with me everywhere

#utility.py
#V1

DEBUG=True
import pickle, textwrap, os, csv
from glob import glob

def pickle_data(file_addr, data):
    """Pickle `data` into the file at `file_addr`, replacing its contents.

    The file is closed even when pickling fails part-way.
    """
    with open(file_addr,'wb') as f_pickle:
        pickle.dump(data, f_pickle)

def unpickle_data(file_addr, defaultobject=None):
    """Load the pickled object stored at `file_addr`.

    When the file is not readable, `defaultobject` is pickled to
    `file_addr` (creating the file) and returned instead.
    """
    # NOTE: pickle.load runs arbitrary code from the file; only use this
    # on files this program wrote itself.
    if os.access(file_addr, os.R_OK):
        with open(file_addr,'rb') as f_pickle:
            return pickle.load(f_pickle)
    pickle_data(file_addr,defaultobject)
    return defaultobject

def flatten_list(list_item):
    """Return a new flat list holding the non-list leaves of `list_item`.

    Only objects whose type is exactly `list` are descended into; tuples
    and other containers are kept as single items.
    """
    flat = []
    for element in list_item:
        if type(element) is list:
            flat.extend(flatten_list(element))
        else:
            flat.append(element)
    return flat

def glob_list(args1):
    """Expand every glob pattern in `args1` and return all matches in one list.

    Matches are kept in pattern order; patterns that match nothing
    contribute nothing.
    """
    return [match for pattern in args1 for match in glob(pattern)]

def dbgmsg(text,links=False):
    """Print `text` with a ' DEBUG: ' prefix when the DEBUG flag is on.

    The message is word-wrapped with textwrap.fill unless `links` is
    true, in which case it is printed on one unbroken line.
    """
    if not DEBUG:
        return
    message = " DEBUG: " + text
    print(message if links else textwrap.fill(message))

def csv_write(filename, rowlist):
    """Write `rowlist` (an iterable of rows) to `filename` as CSV.

    The file is overwritten.  It is opened with newline='' as the csv
    module requires, so no extra blank lines appear on platforms that
    translate line endings, and it is closed even if writing fails.

    Returns True.
    """
    with open(filename,'w',newline='') as f_csv:
        csv.writer(f_csv).writerows(rowlist)
    return True

def csv_read(filename, isfile=True):
    """Parse CSV data and return it as a list of rows (lists of strings).

    filename: path of the file to read, or, when `isfile` is false, any
        iterable of CSV lines handed straight to csv.reader.
    isfile: whether `filename` is a path to open.

    A file is opened with newline='' as the csv module requires and is
    closed even if parsing fails.
    """
    if not isfile:
        return list(csv.reader(filename))
    with open(filename,'r',newline='') as f_csv:
        return list(csv.reader(f_csv))


serch.py

[edit]

This is the way to update editor data from the website in real time. It is incredibly slow and heavy on the server. That's why you only use this on the list of pages which had a single editor as of your most recent version of the stub-meta-history file: then it is about 1/26th the number of files to check, and it doesn't take several months and dozens of GB of transfer.

#!/bin/python
#tool for checking real time from a list of wikipage titles
#whether the page has more than one contributor,
#is a redirect, or has templates, and such things.
#but because this tool is rather slow and heavy on
#the server load... better to use it on small list
#of wikipages just to keep them up2date.

import csv
from urllib.parse import quote
import os
import sys
import re
from hashlib import md5
from utility import *

def wget(link,outfile):
    """Download `link` to the local file `outfile` with the wget program.

    The arguments are passed to wget as a list rather than through a
    shell command line, so quotes or shell metacharacters in a page
    title cannot break or hijack the command.  wget's exit status is
    ignored.  Raises FileNotFoundError when wget is not installed.
    """
    import subprocess
    subprocess.run(['wget', '-q', link, '-O', outfile], check=False)

def make_urls():
    """Return the URL templates used to query Special:Export.

    The result maps 'current' (limit=1) and 'data' (limit=10, with
    history) to a dict holding the 'prefix' and 'suffix' that go around
    a page name.
    """
    export_prefix = 'http://wiki.riteme.site/w/index.php?title=Special:Export&pages='
    current = {'prefix': export_prefix, 'suffix': '&limit=1&action=submit'}
    data = {'prefix': export_prefix, 'suffix': '&limit=10&action=submit&history'}
    return {'current': current, 'data': data}

def get_specific_link(url_book, pagename):
    """Build the 'current' and 'data' URLs for one page.

    Each URL is the matching prefix from `url_book`, then `pagename`,
    then the matching suffix.
    """
    return {
        kind: url_book[kind]['prefix'] + pagename + url_book[kind]['suffix']
        for kind in ('current', 'data')
    }

def read_link(url_to_get, localaddr):
    """Download `url_to_get` to `localaddr` and return the file opened for reading.

    Any previous file at `localaddr` is removed first so a failed
    download cannot be mistaken for fresh data.  The caller is
    responsible for closing the returned file handle.
    """
    # os.remove avoids building a shell command from the path; a file
    # that does not exist yet is not an error.
    try:
        os.remove(localaddr)
    except FileNotFoundError:
        pass
    wget(url_to_get,localaddr)
    return open(localaddr,'r')

"""class HistoryChecker():
    def __init__(self):
    
    def load_from_web(self, web_addr):
        dbgmsg('getting contributors')
        f_contrib=read_link(web_addr,'/tmp/contrib.txt')
        dbgmsg('done')
        self.contrbrs=f_contrib.readlines()
        self.contrbrs=[re.sub('^.*?\t(.*?)\t.*','\g<1>',x).rstrip() for x in self.contrbrs]
        self.contrbrs=self.de_bot(self.contrbrs)
        f_contrib.close()
        return True
    
    def gauntlet(self, level=0):
        if level >=0:
            for test in [self.check_max_editors,
                        self.check_min_editors]:
                if not test(self.contrbrs): return False
        #if level >=1:
        #    for test in [self.check_editor_bg]:
        #        if not test(): return False
        #if level >=2:
        #    pass
        dbgmsg("PASSED level " + str(level) + " contributor check.")
        return True"""
    


class ContentChecker():
    """Checks, from live Special:Export XML, whether a page still looks single-author.

    Usage per page: test_if_redirect(), then load_from_web(), then
    gauntlet().  The bot list is read from 'bot_list.txt' in the current
    directory when the checker is created.
    """

    _TITLE_RE = re.compile('<title>(.+?)</title>')
    _REDIRECT_RE = re.compile(r">\s*\#redirect(\s|$)")
    _USERNAME_RE = re.compile(r'^\s*<username>(((?!username>).)*)</username>\s*$')
    _IP_RE = re.compile(r'^\s*<ip>(((?!ip>).)*)</ip>\s*$')
    # md5 digests of the (lowercased) XML returned for a non-existent page.
    _MISSING_PAGE_HASHES = ('caa3fe485e6f6518af1e5ea59e131f68',
        '3a98a2e740d741a7750f034a99e70025',
        'f8f49e37b4c4bff5ecac639237a0129f')

    def __init__(self):
        """Load the lowercased bot names, one per line, from bot_list.txt."""
        with open('bot_list.txt', 'r') as f_bot:
            self.bot_list = [line.rstrip().lower() for line in f_bot]

    def test_if_redirect(self, pagename, web_addr):
        """Download `web_addr` and record whether the page is a real, non-redirect page.

        pagename: the quoted title that was requested.
        web_addr: export URL of the page's current revision.

        Sets and returns self.is_not_redirect: True only when the
        returned title (quoted) equals `pagename` and the first 2750
        characters contain no '#redirect' at the start of the text.  A
        response without a title counts as "not usable" (False).
        """
        with read_link(web_addr,'/tmp/x.xml') as f_page:
            data=f_page.read()[:2750]
        title=self._TITLE_RE.search(data)
        if not title:
            #okay, well technically it's probably a defunct page, but whatever, nomenclature later...
            self.is_not_redirect=False
            return self.is_not_redirect
        same_title = quote(title.group(1)) == pagename
        if same_title and not self._REDIRECT_RE.search(data.lower()):
            print("not a redirect")
            self.is_not_redirect=True
            return self.is_not_redirect
        print("a redirect")
        self.is_not_redirect=False
        return self.is_not_redirect

    def load_from_web(self, web_addr):
        """Fetch the page history XML and extract its editors.

        Only downloads when the last test_if_redirect() call succeeded;
        otherwise self.data and self.editors are set to ''.  The XML is
        stored lowercased in self.data.  Returns True.
        """
        if self.is_not_redirect:
            dbgmsg('getting content') #if we wanted to read content
                #from database, this is where we'd do it instead.
                #the parameter would be something like pagename instead.
            with read_link(web_addr,'/tmp/x.xml') as f_page:
                self.data = f_page.read().lower()
            self.editors=self.get_editors(self.data)
        else:
            self.data =''
            self.editors=''
        return True

    def gauntlet(self, level=0):
        """Run the checks up to `level` on the loaded page; True if all pass.

        Level 0: the page still exists and has at most one non-bot
        editor.  Level 1 adds: the text contains no template.  Level 2
        adds nothing yet.  A redirect or unusable page fails at once.
        """
        if not self.is_not_redirect: return False
        if level>=0:
            if not self.check_still_exists(self.data): return False
            if not self.check_max_editors(self.editors): return False
        if level>=1:
            if not self.check_no_template(self.data): return False
        if level>=2:
            pass
        dbgmsg("PASSED level " + str(level) + " content check.")
        return True

    def de_bot(self, usernames):
        """Return a list of the names in `usernames` that do not look like bots.

        A name is treated as a bot when 'bot' appears in its last five
        characters (case-insensitive) or when its lowercased form is in
        the bot list, which is itself stored lowercased.
        """
        known_bots=set(self.bot_list)
        return [name for name in usernames
                if 'bot' not in name[-5:].lower()
                and name.lower() not in known_bots]

    def get_editors(self,pagedata, revision_count=9):
        """Collect the distinct non-bot editors named in export XML.

        pagedata: export XML text with one tag per line; at least five
            revisions are suggested.
        revision_count: stop scanning once this many distinct editors
            have been seen at the end of a contributor block.

        Returns a list (in no particular order) of usernames and IPs
        found inside <contributor> blocks, with bots removed.
        """
        editors=set()
        contributor_block=False
        for line in pagedata.split('\n'):
            if '<contributor>' in line:
                contributor_block=True
            elif not contributor_block:
                continue
            elif '</contributor>' in line:
                contributor_block=False
                if len(editors)==revision_count:
                    print(repr(editors))
                    break
            elif '<username>' in line:
                editors.add(self._USERNAME_RE.sub(r'\g<1>',line))
            elif '<ip>' in line:
                editors.add(self._IP_RE.sub(r'\g<1>',line))
        return self.de_bot(editors)

    def check_still_exists(self, pagedata):
        """Return False when `pagedata` is the known "no such page" response.

        This is only useful if the content is more recent than the title
        list, e.g. when fetched live from Special:Export.  The md5 digest
        of any other content is printed.
        """
        pagehash=md5(pagedata.encode())
        if pagehash.hexdigest() in self._MISSING_PAGE_HASHES:
            dbgmsg("X: wiki page no longer exists")
            return False
        print(pagehash.hexdigest())
        return True

    def check_no_template(self, pagedata):
        """Return False when the text contains '{{' (a template call)."""
        if "{{" in pagedata:
            dbgmsg("X: has a template")
            return False
        return True

    def check_max_editors(self, contributors):
        """Return True when there is at most one contributor; prints the list either way."""
        if len(contributors) > 1:
            dbgmsg("X: >1 contributors")
            print(repr(contributors))
            return False
        print(repr(contributors))
        return True

    def check_min_editors(self,contributors):
        """Return False when no contributors are left after bot removal.

        This test may be excluded if you think it's important to check
        bot-created pages for sanity.
        """
        if len(contributors)==0:
            dbgmsg("X: only bot contributors")
            return False
        return True


def main(titlefile):
    """Re-check a list of suspected one-author pages against the live site.

    titlefile: path of a text file with one (already quoted) page title
        per line.

    Titles are processed in batches of up to 100.  After each batch the
    titles that passed are written to './results/results<offset>.txt'
    and the read offset is pickled to 'stored_data.pickle', so an
    interrupted run resumes where it stopped.
    """
    loc_addrfile='stored_data.pickle'
    lastloc=unpickle_data(loc_addrfile,0)
    url_book=make_urls()

    loc_output='./results/results'
    os.makedirs(os.path.dirname(loc_output), exist_ok=True)
    #loop is designed around iterating through the title file,
    #not through a variable holding all its data.
    #this means we can loop thru large title files (which would
    #freeze us up if put in memory.
    with open(titlefile, 'r') as f_titles:
        f_titles.seek(0, 2) #find the byte address of the end of file.
        loc_end_of_file=f_titles.tell()
        f_titles.seek(lastloc)

        while f_titles.tell() < loc_end_of_file:
            one_author_only=[] #temp repository of pages that we've found
                #to have one author
            handful = [] #handful of pages to check
            dbgmsg("getting titles")
            for _ in range(0,100):
                line=f_titles.readline()
                if line:
                    handful.append(line.rstrip()) #assumes title list is already quoted

            contentcheck=ContentChecker()
            for pagename in handful:
                #build the URL addresses for getting data about the page
                link = get_specific_link(url_book, pagename)
                dbgmsg(str(lastloc)+'page addr:' + link['current'],links=True)
                valid=contentcheck.test_if_redirect(pagename=pagename,web_addr=link['current'])
                if not valid: continue
                dbgmsg(str(lastloc)+'page addr:' + link['data'],links=True)
                contentcheck.load_from_web(link['data'])
                valid = contentcheck.gauntlet()
                if not valid: continue #next pagename
                one_author_only.append(pagename+'\n')
            dbgmsg("adding new data")

            # The file is named after the offset the batch started at.
            with open(loc_output+str(lastloc)+'.txt','w') as f_results:
                f_results.writelines(one_author_only)
            lastloc=int(f_titles.tell())

            print('read ' + str(len(handful)) + ' pages\' history, of which ',
            str(len(one_author_only)),
            ' met conditions. We are at:', lastloc)

            dbgmsg("storing data")
            pickle_data(loc_addrfile, lastloc)
if __name__ == '__main__':
    main(titlefile=sys.argv[1])
    

"""

This program checks, via the internet, all the suspected one-author pages to see whether they really are one-author pages. It breaks the list down into a bunch of files in the results folder. Together these files list all the pages which still seem to have only one author. Concatenate them into one file afterwards by doing...

python3.0 serch.py
cd results
cat *.txt > ../one_author_pages.title

You'll probably want to change this into a pageset so you can remove pages with experienced authors, so here's how to work backwards and do that.

cd ..
python
import utility
data_based=utility.csv_read('one_author_pages_prelim.csv')
int_based=open('one_author_pages.title')
r2=[x.rstrip() for x in int_based.readlines()]
dictform={}
for page in data_based:
 	dictform[page[0]]=page[1:]    
for page in r2:
 	if page not in dictform:
 		print(page) #should print nothing, as int_based was just a narrowing down of data_based
newcsv=[]
for page in r2:
 	a=[page]           
 	a.extend(dictform[page])
 	newcsv.append(a)
utility.csv_write('One_author_Pageset.csv',newcsv)
"""

shell commands

[edit]

A couple of shell commands I made use of. I need to integrate these into the code, even though it will take more lines in Python. They may seem random and unintuitive, but they're mostly for quickly converting from a pageset to a title list, or for dealing with edit-count files.

#from /opt/editcounts/*
# Split the per-block "editor,edits" files by edit count. The pattern
# matches rows whose count has one or two digits (0-99); -v selects
# every other row. -h drops the file-name prefix from the output.
grep -hiE '^[0123456789]+\,([0123456789]{1,2})\s*$' edits.ids.*.txt > ids_lt_99_edits
grep -hiEv '^[0123456789]+\,([0123456789]{1,2})\s*$' edits.ids.*.txt > ids_gt_99_edits
grep -hiE '^[0123456789]+\,([0123456789]{1,2})\s*$' edits.ips.*.txt > ips_lt_99_edits
grep -hiEv '^[0123456789]+\,([0123456789]{1,2})\s*$' edits.ips.*.txt > ips_gt_99_edits

# Split limited.txt into rows that consist of exactly one comma-free
# field followed by a number, and all remaining rows.
grep -E "^[^,]+,[0-9]+\s*$" limited.txt > updated_list_as_pageset_bot_made_only
grep -Ev "^[^,]+,[0-9]+\s*$" limited.txt > updated_list_as_pageset_with_humans

# Strip the trailing ",<count>" column so only the editor number remains.
sed -r 's/,[0-9]+\s*$//g' ips_gt_99_edits > iplist_gt_edits 
#just a list of base 10 ips, doesn't include editcounts

sed -r 's/,[0-9]+\s*$//g' ids_gt_99_edits > idlist_gt_edits 
#just a list of ids, doesn't include editcounts

get_redirects.py

[edit]

Deals with the enwiki-pages.sql file to get a list of redirects for wiki_pageset.py. Usually called on its own, with a little bit of customization.

import re, random, sqlite3
import pageparser_db
from urllib.parse import quote
# Input: the SQL dump to scan, opened as text.
d=open('enwiki-20081008-page.sql')
#d=open('page.sql')
redirects = []  # only filled by the disabled sampling variant below
# Find the end-of-file position so progress can be reported, then rewind.
d.seek(0,2)
eof_loc = d.tell()
d.seek(0)
i=0  # chunk counter
#base, cu=pageparser_db.connect_redirect_base()
# Output: one entry per line.
f_r=open('redirect_list','w')
#initial page id only? i dunno, seems like it might be good to check for both though, cause this definitely removed some when I used it initially.
"""
while d.tell() < eof_loc:
     content=d.read(1000000)
     redirect_data=re.findall("\((\d+),\d+,\'.+?\',\'.*?\',\d+,(\d)", content)
     for article in redirect_data:
            if int(article[1])==1:
                if random.randint(1,10000)==500:
                     redirects.append(article[0])
     del redirect_data
     i+=1
     print("ahoy", str(i))
"""

#title paired with is_redirect
# Start of one row tuple: (number,number,'title','...',number,1 -- the
# trailing 1 is the redirect flag; the title is captured.
redirect_row = re.compile(r"\(\d+,\d+,'(.+?)','.*?',\d+,1")
# Text kept from the end of the previous chunk, so a row that straddles
# a chunk boundary is still seen whole.
carry = ''
while d.tell() < eof_loc:
     content = carry + d.read(40000000)
     titles = []
     consumed = 0
     for row in redirect_row.finditer(content):
         # Store titles with spaces instead of underscores, URL-quoted.
         titles.append(quote(row.group(1).replace('_', ' ')) + '\n')
         consumed = row.end()
     f_r.writelines(titles)
     # Never re-scan text that already matched, and keep at most a few
     # KB of unmatched tail (far longer than one row prefix).
     carry = content[max(consumed, len(content) - 4096):]
     del content, titles
     # A separate name is used for the rows above so the chunk counter
     # is not overwritten by an inner loop.
     i+=1
     if i>5:
         f_r.flush()
     print("ahoy", str(i))
     print("we are at", str(int(100*d.tell()/float(eof_loc))), "%...")


"""
#here, the redirects field comes before the page_latest_id field,
#so we use article 0.
while d.tell() < eof_loc:
     content=d.read(40000000)
     redirect_data=re.findall("\(\d+,\d+,'.+?','.*?',\d+,1,\d+,[\d\.]+?,'\d+?',(\d+)", content)
     for i in range(len(redirect_data)):
         redirect_data[i]=redirect_data[i]+'\n'
     f_r.writelines(redirect_data)
     del redirect_data
     i+=1
     if i>5:
         f_r.flush()
     print("we are at", str(int(100*d.tell()/float(eof_loc))), "%...")
"""

# Push any buffered output to disk and release the output file.
f_r.flush()
f_r.close()

list of bots

[edit]

The bot list used can be found here, though you'll probably want the more recent version from the category page.