Wikipedia:Persondata/transform.pl
Appearance
#!/usr/bin/perl -w
use POSIX;
##########################
# Transform Persondata
##########################
# Parse date entry and expand each date field to 8 fields
# Likewise the articles about places and first/last names
##########################
my %months=(
January => 1,
February => 2,
March => 3,
April => 4,
May => 5,
June => 6,
July => 7,
August => 8,
September => 9,
October => 10,
November => 11,
December => 12
);
my %monthname = reverse %months;
my $MONTH = "(" . join("|",keys %months) . ")";
my $YEAR = "(\\d{1,4})";
my $PREFIX = "([^0-9]+)";
# Clean up date entry for standardised persondata date format
sub clean_date {
my $d = shift;
# Put spaces between wikilinks
$d =~ s/([\]])([\[])/$1 $2/g;
# Deal with piped links
$d =~ s/[\[]{2}([^\]]*)[|](.*)[\]]{2}/$2/g;
# Remove square brackets
$d =~ s/[\[\]]//g;
# Deal with templates
$d =~ s/{{([Bb]irth|[Dd]eath) date( and age)?\s?\|\s?(\d{1,4})\|\s?(\d{1,2})\|\s?(\d{1,2})(.*)/$3-$4-$5/g;
# DD.MM.YYYY => DD MMM YYYY
if ($d =~ /^0?(\d{1,2})\.(\d{1,2})\.(\d{1,4})$/) {
my $month = int $2;
$d = "$1 ".$monthname{$month}." $3";
}
# YYYY-MM-DD => DD MMM YYYY
if ($d =~ /^(\d{1,4})-(\d{1,2})-0?(\d{1,2})$/) {
my $month = int $2;
$d = "$3 ".$monthname{$month}." $1";
}
# AD/CE is implicit
$d =~ s/A\. ?D\.//; # TODO: doesn't completely work?? e.g. [[Libanios]]
$d =~ s/A ?D//;
$d =~ s/\bC\. ?E\.//;
$d =~ s/\bC ?E//;
# Remove trailing punctuation
$d =~ s/(,|\||;|=)$//;
# Remove HTML comments
$d =~ s/<!--(.*)-->//;
# Add forgotten spaces
$d =~ s/([a-z])(\d)/$1 $2/g;
$d =~ s/(\d)([a-z])/$1 $2/g;
$d =~ s/\.([^ ])/. $1/g;
#Remove spaces from ordinals
$d =~ s/(\d) (th|st|nd|rd|s\b)/$1$2/g;
# remove double spaces
$d =~ s/\s+/ /g;
# remove spaces at beginning and end
$d =~ s/^\s+//;
$d =~ s/\s+$//;
# Fields with only question marks => "unknown"
$d =~ s/^\?$/unknown/;
# Uniform capitalisation
$d =~ s/Unknown/unknown/g;
$d =~ s/After/after/g;
# question marks should always have brackets and a space before them
$d =~ s/\(\?\)/\?/g;
$d =~ s/([^ ])\?+/$1 \?/g;
$d =~ s/\?/(\?)/g;
# Instead of a question mark at the end, 'probably' at the beginning
$d =~ s/^(.+) \(\?\)$/probably $1/;
$d =~ s/^\(\?\)$/probably/;
# Slash without space
$d =~ s/ ?\/ ?/\//g;
# Remove bolding/italics
$d =~ s/\'{2,5}//g;
# Shortened month names
$d =~ s/Jan[\.\s]+/January /;
$d =~ s/Feb[\.\s]+/February /;
$d =~ s/Mar[\.\s]+/March /;
$d =~ s/Apr[\.\s]+/April /;
$d =~ s/Jun[\.\s]+/June /;
$d =~ s/Jul[\.\s]+/July /;
$d =~ s/Aug[\.\s]+/August /;
$d =~ s/Sep[\.\s]+/September /;
$d =~ s/Sept[\.\s]+/September /;
$d =~ s/Oct[\.\s]+/October /;
$d =~ s/Nov[\.\s]+/November /;
$d =~ s/Dec[\.\s]+/December /;
# Write out "Century"
$d =~ s/C\./century/g;
# "End of the 5th century" => "End 5th century" (simpler)
$d =~ s/ of the//;
$d =~ s/(approx\.?|[Cc]irca\.?\s?|\bca?[\.\s]+|about|around|~)/circa /;
$d =~ s/([Pp]ossibly|[Pp]robably)/probably/g;
$d =~ s/([Ss]till )?[Ll]iving( [Pp]erson)?//;
$d =~ s/[Nn]\/?[Aa]//;
$d =~ s/[Nn]ot [Aa]pplicable//;
# Remove double spaces
$d =~ s/\s+/ /g;
#----------Error fixing---------
# Other typing errors
#$d =~ s/^Um/um/;
#$d =~ s/chr/Chr/;
return $d;
}
sub ilog10 { $x=shift; return int log10($x); }
my @fields = ( "day", "month", "year", "century", "decade", "year1", "year2", "note" );
sub parse_date {
my $date = shift;
my %d;
if (! $date ) {
$d{"note"} = "";
# Normal date entry
} elsif ($date =~ /^$PREFIX?(\d+ )?$MONTH,? $YEAR( BCE?\.?)?$/) {
$d{note} = $1 if defined $1;
$d{day} = substr($2,0,-1) if defined $2;
if (defined $3) {
$d{month} = $months{$3};
}
$d{year} = int $4;
$d{year} = -$d{year} if defined $5; # B.C.
} elsif ($date =~ /^$PREFIX?$MONTH (\d+)?,?\s?$YEAR( BCE?\.?)?$/) {
$d{note} = $1 if defined $1;
$d{day} = substr($3,0) if defined $3;
if (defined $2) {
$d{month} = $months{$2};
}
$d{year} = int $4;
$d{year} = -$d{year} if defined $5; # B.C.
} elsif ($date =~ /^$PREFIX?$YEAR( BCE?\.?)?$/) {
$d{note} = trim($1) if defined $1;
$d{year} = int $2;
$d{year} = -$d{year} if defined $3; # B.C.
# Century
#} elsif ($date =~ /^(around |probably |[Bb]eginning |[Mm]iddle |[Ee]nd )?(\d{1,2})\. [Cc]entury( BCE?\.?)?$/) {
} elsif ($date =~ /^$PREFIX?(\d{1,2})(\.|th|st|nd|rd) [Cc]entury( BCE?\.?)?$/) {
$d{note} = trim($1) if defined $1;
$d{century} = $2;
$d{century} = -$d{century} if defined $4;
# Decade
} elsif ($date =~ /^(\d{1,4}) ?'?s( BCE?\.?)?$/) {
$d{decade} = (int $1 / 10) * 10;
$d{decade} = -$d{decade} if defined $3;
$d{century} = int ($d{decade} / 10) + 1;
} elsif ($date =~ /^$PREFIX?$YEAR or $YEAR$/) { # TODO: or != until, TODO: B.C.
$d{note} = trim($1) if defined $1;
$d{year1} = int $2;
$d{year2} = int $3;
} elsif ($date =~ /^$PREFIX?$YEAR\/(\d{1,4})$/) { # TODO: B.C.
$d{note} = trim($1) if defined $1;
$d{year1} = int $2;
$d{year2} = int $3;
# Example: 1632/33 => 1632/1633
$c = ilog10($d{year1}) - ilog10($d{year2});
if ( $c < 0 ) { # Errr
$d{year1} = "";
$d{year2} = "";
$d{note} = $date;
} else {
$d{year2} = substr($d{year1},0,$c) . $d{year2};
}
# Interval of several years # TODO: not tested!
} elsif ( $date =~ /^$PREFIX?between $YEAR( and )$YEAR( BCE?\.?)?$/ or
$date =~ /^$PREFIX?$YEAR( to |-)$YEAR$( BCE?\.?)?/ )
{
$d{note} = trim($1) if defined $1;
$d{year1} = int $2;
$d{year2} = int $4;
if (defined $5) {
($d{year1}, $d{year2}) = (-$d{year2}, -$d{year1});
}
#} elsif ($date =~ /^$PREFIX?$YEAR( to |-)$YEAR$/) {
# $d{note} = trim($1) if defined $1;
# $d{year1} = int $2;
# $d{year2} = int $4;
} else {
$d{note} = $date;
}
if (defined $d{year1} and defined $d{year2}) {
# if years need to be switched
#if ($d{year1} > $d{year2}) {
# ($d{year1}, $d{year2}) = ($d{year2}, $d{year1});
#}
# if century can be defined
if ( (int $d{year1} / 100) eq (int $d{year2} / 100)) {
$d{century} = (int $d{year1} / 100) + 1;
}
# if decade can be defined
if ( $d{century} and (int $d{year1} / 10) eq (int $d{year2} / 10)) {
$d{decade} = (int $d{year1} / 10) * 10;
}
}
# calculate decade and century
if (defined $d{year}) {
$d{decade} = (int $d{year} / 10) * 10;
}
# calculate century
if (defined $d{decade}) {
if( $d{decade} != 0 ){
$d{century} = (int $d{decade} / 100) + (int $d{decade} / abs $d{decade});
}
else{
if (defined $d{year}) {
$d{century} = (int $d{decade} / 100) + (int $d{year} / abs $d{year});
}
else{
}
}
}
# Initialise undefined fields
foreach my $f (@fields) {
$d{$f} = '' if not defined $d{$f};
}
return %d;
}
sub parse_location {
my $p = trim(shift);
# [[a]] or [[b]] <- so what is it?!
# [[...]]
# [[...]], ...
# [[...]] , ...
# [[...]]/ ... <- preferably not like this
if ($p =~ /^\[\[([^\]]+)\]\].*?$/) {
my $a = $1;
$a =~ s/\|.*$//;
#print "$p|$a\n";
return $a;
} else {
#print "!$p\n";
return "";
}
}
#For PND number (assigned to German-speaking authors).
#Small number of articles on en wiki have this
sub parse_pnd {
my $p = trim(shift);
my $pnd_nr = "";
my $pnd_date = "";
if ($p =~ /^(\d)(\d)(\d)(\d)(\d)(\d)(\d)(\d)([\dX])/) {
my $check = $9;
$check = 10 if $check eq "X";
# pruefziffer
if ( ((2*$1+3*$2+4*$3+5*$4+6*$5+7*$6+8*$7+9*$8) % 11) eq $check ) {
# TODO: Number range 10000000 bis 14999999
if ($1 == 1 && $2>=0 && $2<=4) {
$pnd_nr = "$1$2$3$4$5$6$7$8$9";
}
}
}
if ($p =~ /(\d){1,2}\.(\d{1,2})\.(\d\d\d\d)/) {
$pnd_date = "$3-$2-$1";
}
return ($pnd_nr, $pnd_date);
}
sub unbracket {
my $p = shift;
# Insert missing spaces between wikilinks
$p =~ s/([\]])([\[])/$1 $2/g;
# remove square brackets
$p =~ s/[\[\]]//g;
return $p;
}
sub trim {
my $p = shift;
if ($p) {
# remove empty spaces at beginning and end
$p =~ s/^\s+//;
$p =~ s/\s+$//;
}
return $p;
}
##########################
while(my $line = <>) {
my @pd = split("\t",$line);
chop($pd[-1]); # remove end-of-line
my @pd_transformed;
$pd_transformed[0] = trim($pd[0]); # pd_id
$pd_transformed[1] = trim($pd[1]); # pd_article
$pd_transformed[2] = trim($pd[2]); # pd_name
$pd_transformed[3] = trim($pd[3]); # pd_alternative
$pd_transformed[4] = trim($pd[4]); # pd_description
$pd_transformed[5] = unbracket(trim($pd[5])); # pd_born
$pd_transformed[6] = trim($pd[6]); # pd_born_in
$pd_transformed[7] = unbracket(trim($pd[7])); # pd_died
$pd_transformed[8] = trim($pd[8]); # pd_died_in
$pd_transformed[9] = trim($pd[9]); # pd_pnd
# extract checked pnd-nr and additional date
($pd_transformed[10], $pd_transformed[11]) = parse_pnd($pd[9]); # pnr_nr, pnd_date
if ( trim($pd[2]) =~ /^([^,]+),([^,]+)$/ ) {
$pd_transformed[12] = trim($2); # n_given
$pd_transformed[13] = trim($1); # n_surname
$pd_transformed[14] = ''; # n_suffix
}
elsif ( trim($pd[2]) =~ /^([^,]+),([^,]+),([^,]+)$/ ) {
$pd_transformed[12] = trim($2); # n_given
$pd_transformed[13] = trim($1); # n_surname
$pd_transformed[14] = trim($3); # n_suffix
}
else {
$pd_transformed[12] = ''; # n_given
$pd_transformed[13] = ''; # n_surname
$pd_transformed[14] = ''; # n_suffix
}
$pd_transformed[15] = parse_location($pd[6]); # b_place
$pd_transformed[16] = parse_location($pd[8]); # d_place
my %born = parse_date(clean_date($pd[5]));
$pd_transformed[17] = $born{day}; # b_day
$pd_transformed[18] = $born{month}; # b_month
$pd_transformed[19] = $born{year}; # b_year
$pd_transformed[20] = $born{decade}; # b_decade
$pd_transformed[21] = $born{century}; # b_century
$pd_transformed[22] = $born{year1}; # b_year1
$pd_transformed[23] = $born{year2}; # b_year2
$pd_transformed[24] = $born{note}; # b_note
my %died = parse_date(clean_date($pd[7]));
$pd_transformed[25] = $died{day}; # d_day
$pd_transformed[26] = $died{month}; # d_month
$pd_transformed[27] = $died{year}; # d_year
$pd_transformed[28] = $died{decade}; # d_decade
$pd_transformed[29] = $died{century}; # d_century
$pd_transformed[30] = $died{year1}; # d_year1
$pd_transformed[31] = $died{year2}; # d_year2
$pd_transformed[32] = $died{note}; # d_note
print join("\t",@pd_transformed) . "\n";
}