#!/usr/bin/perl -w
#
# Fetches job listings from pgh.careercast.com, page by page, and prepends
# every job whose id is not already present in the local listing file to
# that file, stamped with today's date.  The previous version of the file is
# copied to "<file>.bak" before it is rewritten.
#
# Console output (progress and counts) goes to STDOUT.

use strict;
use warnings;
use LWP::UserAgent;

my $data    = '/html/jobs/post-gazette-listings.html';
my $urlbase = "http://pgh.careercast.com";

# First results page: the fixed query plus one qInd parameter per category,
# in the order the site was originally queried.
my @categories = qw(
    general accounting administrative computer
    education professional hospitality
);
my $first_url = join '&',
    "$urlbase/js.php?lookid=pgh&qSort=date&qState=PA&view=2&pp=100",
    map { "qInd=pghcategory$_" } @categories;

# NOTE(review): in the copy of this script that was reviewed, everything
# between '<' and '>' had been stripped out of the three patterns below and
# out of the output line written near the end of the script.  What follows is
# a best-effort reconstruction from the surviving fragments -- confirm each
# one against the live page markup / an existing listing file before relying
# on it.
my $table_split_re = qr/<table[^>]*>/;    # presumably splits the page into tables
my $row_split_re   = qr/<tr>/;            # presumably splits a table into rows
my $next_link_re   = qr/<a href="([^"]*)"><img[^>]*but.arrowNext.gif/;

$| = 1;    # unbuffered STDOUT so "fetching page..." appears before the request

# ---------------------------------------------------------------------------
# Load the existing listing and remember which job ids it already contains.
# ---------------------------------------------------------------------------
my %oldjobs;
my $olddata = '';
if (open my $in, '<', $data) {
    # Slurp the whole file.  A fixed-size read would silently drop the tail
    # of a large file, and that tail would then be lost when the file is
    # rewritten below.
    local $/;
    my $content = <$in>;
    close $in;
    $olddata = $content if defined $content;

    # NOTE(review): ids are recognised here as hex digits only, while ids
    # scraped from the site below are "anything up to '&'" -- verify that
    # real ids are always lowercase hex, otherwise they will be re-added on
    # every run.
    my @oj = ($olddata =~ /details.html\?id=([0-9a-f]*)/sg);
    print "number of old jobs found is ", scalar(@oj), "\n";
    $oldjobs{$_} = 'Y' for @oj;
}
else {
    my $err = $!;
    # A file that exists but cannot be read must not be treated as "no
    # history": it would be overwritten with only the new jobs.
    die "error opening $data: $err\n" if -e $data;
    print "error opening $data\n";    # first run: start with an empty history
}

# ---------------------------------------------------------------------------
# Walk the result pages and collect jobs that are not in the listing yet.
# ---------------------------------------------------------------------------
my $ua      = LWP::UserAgent->new;
my @newjobs = ();                     # list of [id, description] pairs
my $next    = '';                     # relative URL of the next results page

PAGE: while (1) {
    my $url = $next ? $urlbase . $next : $first_url;

    print " fetching page...";
    my $res = $ua->get($url);
    if (!$res->is_success) {
        # Stop paging but keep whatever was collected so far.
        print "failed: ", $res->status_line, "\n";
        last PAGE;
    }
    print "got it\n";
    my $page = $res->content;

    # The job table is the page part that carries the "Job Title" header.
    my ($table) = grep { />Job Title</ } split($table_split_re, $page);
    my @rows = defined $table ? split($row_split_re, $table) : ();
    print " Number of rows is ", scalar(@rows), "\n";

    for my $row (@rows) {
        my ($id, $desc) =
            ($row =~ /details.html\?id=([^&]*)[^>]*>([^<]*)/s);
        next if !$id or !$desc;
        next if exists $oldjobs{$id};
        push @newjobs, [$id, $desc];
        $oldjobs{$id} = 'Y';    # avoid listing the same job twice in one run
    }
    print " found ", scalar(@newjobs), " new jobs so far\n";

    # Follow the "next page" arrow; stop when there is none (or it is empty,
    # which would otherwise restart from the first page forever).
    ($next) = ($page =~ $next_link_re);
    last PAGE unless $next;
}

print "number of new jobs found is ", scalar(@newjobs), "\n";

# ---------------------------------------------------------------------------
# Rewrite the listing: new jobs first, then the previous contents.
# ---------------------------------------------------------------------------
my ($day, $mon, $year) = (localtime)[3, 4, 5];
$mon = (qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec))[$mon];
$year += 1900;
my $stamp = "$mon-$day-$year";
print "Date is \n$stamp\n";

if (-e $data) {
    # List form: the path is never interpreted by a shell.
    system('cp', $data, "$data.bak") == 0
        or warn "could not back up $data to $data.bak\n";
}

open my $out, '>', $data or die "cannot write $data: $!\n";
for my $job (@newjobs) {
    my ($id, $desc) = @$job;
    # NOTE(review): reconstructed line format (see the note near the top).
    # The one hard requirement visible in this script is that each line
    # contains "details.html?id=<id>", because that is what the old-job scan
    # above looks for.
    print {$out} qq{<p><a href="$urlbase/details.html?id=$id">$desc</a> $stamp\n};
}
print {$out} $olddata;
close $out or die "error closing $data: $!\n";