Jump to content

User:Umeboshi/Tools/enwiki-xml-splitter

From Wikipedia, the free encyclopedia
#!/usr/bin/python
# This is a script to help split the large xml database dump
# Use 7-zip to extract to stdout -- 7z e -so /path/to/archive.7z | enwiki-xml-splitter
# The page nodes will be extracted into the current directory until the pages per archive
# limit is reached.  Then those pages are put in a new 7z archive and removed.
# Both page and archive filenames use 9 digit zero padded numbers.
# Arguments to the -z option need to be quoted.

import codecs
import glob
import os, sys
import xml.parsers.expat
from hashlib import md5
from optparse import OptionParser
from xml.sax.saxutils import escape, quoteattr
#from xattr import xattr
usage = """usage: %prog [options]
This is a script to help split the large xml database dump
Use 7-zip to extract to stdout -- 7z e -so /path/to/archive.7z | enwiki-xml-splitter
The page nodes will be extracted into the current directory until the pages per archive
limit is reached.  Then those pages are put in a new 7z archive and removed.
Both page and archive filenames use 9 digit zero padded numbers.
Arguments to the -z option need to be quoted.
The index file is a text file that matches page titles to page-#.xml files and their
respective archives.
"""

# Command-line interface.  Every option carries help text so that --help
# documents the whole interface; "%default" is expanded by optparse to the
# option's default value.
parser = OptionParser(usage=usage)

parser.add_option('-v', '--verbose', action='store_true', dest='verbose',
                  default=False, help="this does absolutely nothing")

parser.add_option('--archive-prefix', action='store', dest='archive_prefix',
                  default='enwiki-archive', help="prefix for archive filenames")

parser.add_option('--index-file', action='store', dest='index_filename',
                  default='enwiki-indexfile', help="filename for indexfile")

parser.add_option('--archive-path', action='store', dest='archive_path',
                  default='', help="path to place archives and index in (default .)")

parser.add_option('-p', '--pages-per-archive', action='store', dest='pages_per_archive',
                  default=10, type='int',
                  help="number of page files written between archive runs "
                       "(default %default)")

parser.add_option('-z', '--zipcmd', action='store', dest='zipcmd',
                  default='7z a -t7z -mfb=64 -mx=7',
                  help="archiver command, run through the shell as "
                       "'ZIPCMD ARCHIVE FILES'; quote it so it arrives as one "
                       "argument (default: %default)")

# Note the inverted sense: passing -k stores False in opts.remove_pages.
parser.add_option('-k', '--keep-pages', action='store_false', dest='remove_pages',
                  default=True,
                  help="keep the page-*.xml files after archiving instead of "
                       "removing them")

opts, args = parser.parse_args(sys.argv[1:])

# Resolve where archives and the index file are written.  os.path.join()
# returns its second argument unchanged when the first one is the empty
# string, so the default (empty) archive path needs no special case.
archive_prefix = os.path.join(opts.archive_path, opts.archive_prefix)
index_filename = os.path.join(opts.archive_path, opts.index_filename)

# Module-level settings read by the helper functions and the parser handler.
pages_per_archive = opts.pages_per_archive
zipcmd = opts.zipcmd


def archivefilename(archivenum):
    """Return the archive filename for archive number *archivenum*.

    The result is the module-level ``archive_prefix``, a dash, the number
    zero-padded to nine digits and the ``.7z`` extension.
    """
    padded = '%09d' % (archivenum,)
    return '%s-%s.7z' % (archive_prefix, padded)

def pagefilename(pagenum):
    """Return the page filename for page number *pagenum*.

    The name is ``page-`` followed by the number zero-padded to nine digits
    and the ``.xml`` extension; it carries no directory component.
    """
    padded = '%09d' % (pagenum,)
    return 'page-%s.xml' % padded

def new_pagefile(pagenum):
    """Open the page file for *pagenum* for writing and return it.

    The file is named by ``pagefilename(pagenum)`` and opened through
    ``codecs.open`` in ``'w'`` mode with UTF-8 encoding, so an existing file
    of that name is truncated.  The caller is responsible for closing it.
    """
    # The name is computed once; the original also stored it in an unused
    # local before computing it a second time.
    return codecs.open(pagefilename(pagenum), 'w', encoding='utf8')

def make_indexline(archivenum, pagenum, title):
    """Build one index-file line mapping a page title to its files.

    The line has the form ``<archive>,<pagefile>:<TAB><title><NEWLINE>``
    where ``<archive>`` is the archive filename with any directory part
    stripped and ``<pagefile>`` is the page filename.
    """
    page_name = pagefilename(pagenum)
    archive_name = os.path.basename(archivefilename(archivenum))
    fields = (archive_name, page_name, title)
    return '%s,%s:\t%s\n' % fields

def archive_pagefile(pagenum, archivenum, remove=True):
    """Add a single page file to an archive, optionally deleting it after.

    Runs ``<zipcmd> <archive> <pagefile>`` through ``os.system``.

    pagenum    -- number of the page file to archive
    archivenum -- number of the archive to add it to
    remove     -- when true, delete the page file once it has been archived

    The page file is only deleted when the archiver reports success (exit
    status 0); on failure a message is printed and the file is left in
    place so that no page data is lost.
    """
    pfilename = pagefilename(pagenum)
    afilename = archivefilename(archivenum)
    print('archiving file %s to archive %s' % (pfilename, afilename))
    status = os.system('%s %s %s' % (zipcmd, afilename, pfilename))
    if status != 0:
        # Deleting here would destroy the only copy of the page.
        print('archive command failed with status %s, keeping %s'
              % (status, pfilename))
        return
    if remove:
        os.remove(pfilename)

def archive_pagefiles(archivenum, remove=True):
    """Archive every ``page-*.xml`` file in the current directory.

    archivenum -- number of the archive to create
    remove     -- when true, delete the page files afterwards

    If the archive file already exists it is not touched and the step is
    reported as skipped; the page files are still deleted when *remove* is
    true.  Otherwise ``<zipcmd> <archive> page-*.xml`` is run through
    ``os.system``.  Note that the pattern matches every page file present,
    not only those written since the previous archive.

    When the archiver reports failure (non-zero status) a message is
    printed and the page files are left in place rather than deleted, so
    that a failed archive run does not lose page data.
    """
    afilename = archivefilename(archivenum)
    if os.path.exists(afilename):
        print('skipping archive %s' % afilename)
    else:
        print('creating archive %s' % afilename)
        status = os.system('%s %s page-*.xml' % (zipcmd, afilename))
        if status != 0:
            print('archive command failed with status %s, keeping page files'
                  % status)
            return
    if remove:
        # Delete from Python instead of shelling out to "rm -f" so that the
        # cleanup does not depend on a Unix shell being available.
        for pfilename in glob.glob('page-*.xml'):
            try:
                os.remove(pfilename)
            except OSError as err:
                # Best effort, like "rm -f": report and carry on.
                print('could not remove %s: %s' % (pfilename, err))
    
class ParserHandler(object):
    """expat callback target that writes each <page> element to its own file.

    The three public methods are meant to be installed as an expat parser's
    StartElementHandler, EndElementHandler and CharacterDataHandler.  Every
    event is re-serialised into the current page file; each time a <page>
    starts a new page file is opened, and every ``pages_per_archive`` pages
    the accumulated files are handed to ``archive_pagefiles``.  When a page
    ends, a line for it is appended to the index file.

    If the archive for the current archive number already exists on disk,
    nothing is written or indexed for the pages that belong to it (they are
    reported as skipped).

    Output format notes:
    * attribute values are quoted and character data is escaped so that the
      page files remain well-formed XML;
    * a newline is written after every start tag, so element content in the
      page files begins with an extra newline compared to the input.
    """

    def __init__(self):
        self.pagenum = 1
        self.archivenum = 1
        # Page file 1 receives everything that precedes the first <page>.
        self.outfile = new_pagefile(self.pagenum)
        self.indexfile = codecs.open(index_filename, 'a', encoding='utf8')
        self.inpage = False
        self.intitle = False
        self.current_title = None
        # (archivenum, exists) memo used by _current_archive_exists().
        self._archive_check = None

    def _current_archive_exists(self):
        """Return True if the archive for ``self.archivenum`` is on disk.

        The filesystem is consulted once per archive number and the answer
        is remembered, because this is asked on every parser callback.
        """
        memo = self._archive_check
        if memo is None or memo[0] != self.archivenum:
            exists = os.path.exists(archivefilename(self.archivenum))
            memo = (self.archivenum, exists)
            self._archive_check = memo
        return memo[1]

    def _start_tag(self, name, attrs):
        """Serialise a start tag, quoting and escaping attribute values."""
        parts = [name]
        parts.extend('%s=%s' % (key, quoteattr(value))
                     for key, value in attrs.items())
        return '<%s>\n' % ' '.join(parts)

    def start_element(self, name, attrs):
        """Handle a start tag: rotate files on <page>, then echo the tag."""
        if name == 'page':
            self.pagenum += 1
            self.inpage = True
            # Close the previous page file first: this flushes buffered data
            # before the file can be archived and deleted below, and stops
            # one handle per page from being leaked.
            self.outfile.close()
            if not (self.pagenum - 1) % pages_per_archive:
                archive_pagefiles(self.archivenum, remove=opts.remove_pages)
                self.archivenum += 1
            # make sure empty file stays out of archive
            if not self._current_archive_exists():
                self.outfile = new_pagefile(self.pagenum)
        elif name == 'title':
            self.intitle = True

        if not self._current_archive_exists():
            self.outfile.write(self._start_tag(name, attrs))

    def end_element(self, name):
        """Handle an end tag: echo it and index the page when it closes."""
        writing = not self._current_archive_exists()
        if writing:
            self.outfile.write('</%s>' % name)
        if name == 'page':
            self.inpage = False
            if writing:
                print('indexing %s' % self.current_title)
                indexline = make_indexline(self.archivenum, self.pagenum,
                                           self.current_title)
                self.indexfile.write(indexline)
            else:
                print('skipping %s' % self.current_title)
            self.current_title = None
        if name == 'title':
            self.intitle = False

    def char_data(self, data):
        """Handle character data: collect the title and echo the text.

        The parser may deliver one text node in several pieces, so title
        text is accumulated across calls.  The title is kept unescaped for
        the index; the copy written to the page file is XML-escaped.
        """
        if self.intitle:
            if self.current_title is None:
                self.current_title = data
            else:
                self.current_title += data
        if not self._current_archive_exists():
            self.outfile.write(escape(data))

# Wire the handler object into an expat parser and stream the dump from
# standard input.
ph = ParserHandler()
p = xml.parsers.expat.ParserCreate()

p.StartElementHandler = ph.start_element
p.EndElementHandler = ph.end_element
p.CharacterDataHandler = ph.char_data

# ParseFile() needs a stream whose read() returns bytes.  Where sys.stdin
# is a text stream with a binary ``buffer`` attribute, use that; otherwise
# use sys.stdin itself.
infile = getattr(sys.stdin, 'buffer', sys.stdin)
try:
    p.ParseFile(infile)
finally:
    # Flush and close the last page file and the index even if parsing
    # stops with an error, so that what was written so far reaches disk.
    ph.outfile.close()
    ph.indexfile.close()