User:Plastikspork/spider
Appearance
#!/usr/bin/perl
#
# Retrieve a subdivision place hierarchy from Maplandia
#
# Starting at $PLACEURL, the script fetches a page, extracts the list of
# sub-places from its <div class="rozdel"> block, and recurses into each one.
# The hierarchy is written as a nested wikitext bullet list ("*", "**", ...)
# to STDOUT and to "<BASETITLE>.log".  Diagnostics go to STDERR so they never
# end up inside the generated list.
#

# Initialize
use warnings;
use strict;
use Benchmark;
use LWP::UserAgent;

my ($mdate);
our ($DEBUG, $BASEURL);

# Set the debug level (>= 2 also reports meta-refresh redirects)
$DEBUG = 2;

# Get the time
$mdate = localtime;

# Set the base url; sub-place links found on a page are appended to it
$BASEURL = "http://www.maplandia.com/";

# Where should we start?
my $PLACEURL  = "http://www.maplandia.com/burma/";
my $BASETITLE = "Burma";

# Upper bound on chained meta-refresh redirects followed for one request.
my $MAX_REDIRECTS = 5;

# Output handle for "<BASETITLE>.log"; opened in the main section below.
my $out_fh;

# URLs currently being expanded on the active recursion path.  Used only to
# break cycles (a page that links back to one of its own ancestors).
my %on_path;

# get_http( $url [, $redirects_left] )
#
# Retrieve a requested html page.
#
#   $url             URL to fetch with a GET request.
#   $redirects_left  Optional; how many more meta-refresh redirects may be
#                    followed.  Defaults to $MAX_REDIRECTS.
#
# Returns the raw response content, or "" when the request fails or the
# redirect budget is exhausted.  HTML <META HTTP-EQUIV=Refresh> redirects are
# followed by fetching the target URL exactly as it appears in the page.
sub get_http {
    my ($this_url, $redirects_left) = @_;
    $redirects_left = $MAX_REDIRECTS unless defined $redirects_left;

    # Pattern for an HTML meta-refresh tag; capture group 1 is the target URL.
    my $EV_REDIR = "<META[\r\n\cM ]+HTTP-EQUIV=Refresh[\r\n\cM ]+"
                  ."CONTENT=\"[\r\n\cM ]*[0-9]+;[\r\n\cM ]*"
                  ."URL=([^\" ]*)[\r\n\cM ]*\"[\r\n\cM ]*>";

    my $useragent = LWP::UserAgent->new;
    $useragent->agent("Mozilla/5.0 (compatible; educational project)");
    $useragent->timeout(60);    # Timeout after 60 seconds

    my $http_request     = HTTP::Request->new(GET => $this_url);
    my $useragent_result = $useragent->request($http_request);

    unless ($useragent_result->is_success) {
        print STDERR "Could not get $this_url\n";
        return "";
    }

    my $content = $useragent_result->content;
    if ($content =~ /$EV_REDIR/i) {
        my $reurl = $1;

        # Two pages that refresh to each other would otherwise recurse
        # without end.
        if ($redirects_left <= 0) {
            print STDERR "Too many redirects, giving up at $this_url\n";
            return "";
        }

        print STDERR "\nRedirected to $reurl\n" if ($DEBUG >= 2);
        return get_http($reurl, $redirects_left - 1);
    }

    return $content;
}

# spider_maplandia( $topurl, $indent )
#
# Fetch $topurl, print every sub-place listed on it as a wikitext bullet, and
# recurse into each sub-place one level deeper.
#
#   $topurl  URL of the page to expand.
#   $indent  Bullet prefix for entries found on this page ("*", "**", ...).
#
# Each entry is printed to STDOUT and to the log file as
# "<indent> [[<title>]]".  The sub sleeps 2 seconds after every entry before
# fetching the next page.  Returns nothing.
sub spider_maplandia {
    my ($topurl, $indent) = @_;

    # Refuse to re-enter a page that is already being expanded further up the
    # call chain; without this a link cycle would recurse forever.
    return if $on_path{$topurl};
    $on_path{$topurl} = 1;

    # Grab the page
    my $in = get_http($topurl);

    # Preprocess the HTML
    $in =~ tr/\r\n/ /d;             # CR becomes a space; LF is deleted (/d)
    $in =~ s/[ ]+/ /g;              # Remove redundant spacing
    $in =~ s/<\/?span[^<>]*>//gi;   # Strip <span> tags, keep their content

    # Get the list of subplaces
    if ($in =~ /<div class="rozdel">((?:<ul[^<>]*>|<\/ul>|<li[^<>]*>|<\/li>|<hr[^<>]*>|<a[^<>]*>|<\/a>|[^<>]*)*)<\/div>/i) {
        my $blist = $1;

        foreach my $place ($blist =~ /<li>((?:<a[^<>]*>|<\/a>|[^<>]*)*)<\/li>/gi) {
            # Only list items that carry a link with href and title are places.
            next unless $place =~ /<a[^<>]*href="([^" ]*)" [ ]*title="([^"<>]*)"[^<>]*>[^<>]*<\/a>/;
            my ($surl, $title) = ($1, $2);

            # Square brackets would break the [[...]] wiki link syntax.
            $title =~ s/\[/(/g;
            $title =~ s/\]/)/g;

            # NOTE(review): the captured title cannot contain a literal
            # double quote, so decoding the &quot; entity is presumably what
            # this step was meant to do -- confirm against real page titles.
            $title =~ s/&quot;/"/g;

            print $indent." [[".$title."]]\n";
            print {$out_fh} $indent." [[".$title."]]\n";

            sleep 2;    # Pause between page fetches
            spider_maplandia($BASEURL.$surl, "*".$indent);
        }
    }

    delete $on_path{$topurl};
    return;
}

# Main: open the log, emit the root entry, then walk the hierarchy.
open $out_fh, '>', $BASETITLE.".log"
    or die "Cannot open ".$BASETITLE.".log for writing: $!\n";

print "[[".$BASETITLE."]]\n";
print {$out_fh} "[[".$BASETITLE."]]\n";

spider_maplandia($PLACEURL, "*");

# Buffered write errors only surface at close, so check it.
close($out_fh)
    or die "Cannot close ".$BASETITLE.".log: $!\n";