Wikipedia:WikiProject Wikidemia/Quant/Code/parsexml
Appearance
#!/bin/tcsh
#
# parsexml -- reduce a MediaWiki pages-meta-history XML dump to
# comma-separated header records.
#
# Input : $WDIR/$WFILE
# Output: $WDIR/headers.raw
#
# Only lines containing <title>, <id>, <revision>, <timestamp>, <ip>,
# <username>, <minor /> or <comment> are kept; everything else (including
# the revision text) is dropped.  Every kept line except <title> and
# <revision> is then appended to the line before it, so each <title> line
# and each <revision> line becomes the start of one output record.  The
# field order inside a record is simply the order in which the elements
# appear in the dump.
#
# NOTE(review): commas are stripped from <title> and <comment> lines only;
# a comma inside a <username> would still add a spurious field -- confirm
# whether that matters for the downstream consumer.
#
# Start and end times are printed with date(1) as a crude run timer.

setenv WDIR ~tobacman/bulk/data/wiki/dumps
setenv WFILE idwiki-20060506-pages-meta-history.xml

date

# Remove leftovers from earlier runs (headers.raw1/raw2 were the scratch
# files of older versions of this script) so that a failed run cannot be
# mistaken for a fresh result.
rm -f "${WDIR}/headers.raw1" "${WDIR}/headers.raw2" "${WDIR}/headers.raw"

if (! -r "${WDIR}/${WFILE}") then
    echo "parsexml: cannot read ${WDIR}/${WFILE}"
    exit 1
endif

# The whole transformation is one pipeline, so the dump is read from disk
# once and no intermediate files are written.  Stages, in order:
#
#  1. Keep only the header lines listed above; delete every other line.
#     The "/" of the self-closing minor tag is escaped because "/" is the
#     sed pattern delimiter; " *" accepts both "<minor />" and "<minor/>".
#  2. Per-line clean-up:
#       - strip leading and trailing blanks ([[:blank:]] is space or tab;
#         "\t" inside a bracket expression is not portable),
#       - delete commas on <comment> and <title> lines so free text cannot
#         introduce extra fields,
#       - on <timestamp> lines turn the "T" between date and time into a
#         comma and drop the trailing "Z".
#  3-8. Join passes: a line that starts with the given tag is appended to
#     the previous line, the newline and tag being replaced by
#       <id>        -> ","
#       <timestamp> -> ","
#       <ip>        -> ",ip,"
#       <username>  -> ",name,"
#       <comment>   -> ","
#       <minor />   -> ",-1,"    (marks a minor edit)
#     "\!" is required because csh/tcsh treats "!" as history expansion
#     even inside single quotes.
#  9. Remove all remaining tags, including tags that span several lines.
sed -e '/<timestamp>/b' -e '/<ip>/b' -e '/<username>/b' -e '/<title>/b' \
    -e '/<comment>/b' -e '/<id>/b' -e '/<minor *\/>/b' -e '/<revision>/b' \
    -e d "${WDIR}/${WFILE}" \
  | sed -e 's/^[[:blank:]]*//;s/[[:blank:]]*$//' \
        -e '/<comment>/s/,//g' \
        -e '/<title>/s/,//g' \
        -e '/<timestamp>/s/T/,/g;/<timestamp>/s/Z//' \
  | sed -e :a -e '$\!N;s/\n<id>/,/;ta' -e 'P;D' \
  | sed -e :a -e '$\!N;s/\n<timestamp>/,/;ta' -e 'P;D' \
  | sed -e :a -e '$\!N;s/\n<ip>/,ip,/;ta' -e 'P;D' \
  | sed -e :a -e '$\!N;s/\n<username>/,name,/;ta' -e 'P;D' \
  | sed -e :a -e '$\!N;s/\n<comment>/,/;ta' -e 'P;D' \
  | sed -e :a -e '$\!N;s/\n<minor *\/>/,-1,/;ta' -e 'P;D' \
  | sed -e :a -e 's/<[^>]*>//g;/</N;//ba' \
  > "${WDIR}/headers.raw"

# Remember the pipeline's status before date(1) overwrites it.
set rc = $status

date
exit $rc