#!/usr/pkg/bin/perl
#
# orangelogin.pl - scrape the Orange website and download PDF bills.
#
# Usage: orangelogin.pl configfile
#
# The config file contains "name = value" lines.  Everything from a '#'
# to the end of the line is discarded as a comment, so values cannot
# contain a '#'.  Required settings:
#
#   username     - sent as the txtMSISDN field of the login form
#   password     - sent as the txtPassword field of the login form
#   downloaddir  - directory in which "<invoice>.pdf" files are stored
#
# Bills whose PDF already exists in downloaddir are not fetched again.
#
# Provenance: initial checkin by Gavan Fantom, Wed, 9 Feb 2011
# (commit e116955e5c08414d92ed19a8d055f83f45dafcfd).

use strict;
use warnings;

use WWW::Mechanize;

# Upper bound on how many times follow_processing() polls a
# "processing" page before giving up.
my $MAX_POLL_ATTEMPTS = 10;

# Single browser session shared by the main flow and the subs below.
my $mech = WWW::Mechanize->new();

my $configfile = $ARGV[0];

error("Usage: $0 configfile") unless defined($configfile);

# --- Read the configuration file -----------------------------------------

open my $config_fh, '<', $configfile
    or die "Cannot open $configfile: $!\n";

my %options;
while (my $line = <$config_fh>) {
    chomp $line;                 # no newline
    $line =~ s/#.*//;            # no comments
    $line =~ s/^\s+//;           # no leading white
    $line =~ s/\s+$//;           # no trailing white
    next unless length $line;    # anything left?

    # Split on the first '=' only, so values may themselves contain '='.
    my ($var, $value) = split /\s*=\s*/, $line, 2;
    $options{$var} = $value;
}

close $config_fh;

# A setting written without "= value" is left undefined by the split
# above and is therefore reported as not set.
for my $required (qw(username password downloaddir)) {
    die "$required not set" unless defined $options{$required};
}

my $username    = $options{'username'};
my $password    = $options{'password'};
my $downloaddir = $options{'downloaddir'};

# --- Log in ---------------------------------------------------------------

print "Fetching homepage\n";
$mech->get("https://www.orange.co.uk/");

$mech->follow_link( text_regex => qr/mobile account/ );

print "Logging in\n";

$mech->submit_form(
    form_number => 1,
    fields      => {
        txtMSISDN   => $username,
        txtPassword => $password,
    }
);

# The site answers with an intermediate page whose form is submitted by
# JavaScript when the page loads.  We have no JavaScript, so submit that
# form by hand to complete the login.

print "Following intermediate login form\n";

$mech->submit_form(
    form_number => 1,
    fields      => {
    }
);

# --- Find and download the bills ------------------------------------------

print "Following view your bills link\n";

$mech->follow_link( text_regex => qr/view your bills/ );

follow_processing();

my @links = $mech->find_all_links( text_regex => qr/download PDF/ );
for my $link (@links) {
    my $url = $link->url();

    # The invoice number embedded in the link names the local file.
    my ($invoice) = $url =~ /leg_invoice=(\d+)/;
    if (!defined $invoice) {
        warn "Skipping link without an invoice number: $url\n";
        next;
    }

    my $filename = $downloaddir . '/' . $invoice . ".pdf";
    if (-e $filename) {
        print "Skipping download of invoice $invoice\n";
    }
    else {
#        print "Downloading invoice $invoice to $filename from $url\n";
        print "Downloading invoice $invoice to $filename\n";
        downloadbill($url, $filename);
        #$mech->mirror($url, $filename);
    }
}

# Disabled in the original checkin; kept as debugging aids.
#$mech->follow_link( text_regex => qr/log out/ );

#print $mech->content();

#$mech->dump_forms( undef, $absolute );
#$mech->dump_links( undef, $absolute );

# follow_processing()
#
# While the current page contains a JavaScript assignment of the form
#   var sURL = "...";
# wait one second and fetch that URL, resolved against the directory
# part of the current page's URL.  Stops as soon as the current page no
# longer contains such an assignment, or after $MAX_POLL_ATTEMPTS
# fetches.  Returns immediately, printing nothing, if the current page
# has no such assignment to begin with.
#
# Takes no arguments and returns nothing useful; it works on the shared
# $mech session.
sub follow_processing {
    my $attempts = 0;

    # A list assignment in scalar context yields the number of values
    # on its right-hand side, so the loop ends when the match fails.
    while (my ($url) = $mech->content() =~ /var sURL = "(.*)";/) {
        if ($attempts >= $MAX_POLL_ATTEMPTS) {
            warn "Still processing after $MAX_POLL_ATTEMPTS attempts; "
                . "giving up.\n";
            return;
        }
        $attempts++;

        sleep(1);

        # The URL is lifted out of HTML source, so undo the entity
        # encoding of its ampersands.
        $url =~ s/&amp;/&/g;

#        print "Attempt $attempts. Found URL: $url\n";
        print "Waiting for processing. Attempt $attempts.\n";

        # Replace the last path component of the current URL with the
        # URL found in the page.  Concatenating with "" forces the value
        # returned by uri() into a plain string before it is edited.
        my $newurl = "" . $mech->uri();
        $newurl =~ s/\/[^\/]*$//;
        $newurl = $newurl . "/" . $url;

#        print "I think I'm going to $newurl\n";

        $mech->get($newurl);
    }

    print "Processing finished.\n" if $attempts > 0;
    return;
}

# downloadbill($url, $filename)
#
# Fetch the download page at $url, wait for any "processing" pages, then
# locate the link whose text matches "View your bill" and save its
# target to $filename.  Prints a message and returns without writing a
# file if that link is not found; prints a message if the final download
# does not succeed.
sub downloadbill {
    my ($url, $filename) = @_;

    print "Following download link\n";
    $mech->get($url);
    follow_processing();

    print "Following View your bill link\n";
    my $billlink = $mech->find_link( text_regex => qr/View your bill/ );
    if (!defined($billlink)) {
        print "Unable to download bill\n";
        return;
    }

    my $billurl = $billlink->url_abs();
    print "Downloading bill\n";
    my $response = $mech->mirror($billurl, $filename);

    # mirror() reports failure through the response it returns; 304
    # means the existing local copy is already up to date.
    if (!$response->is_success && $response->code != 304) {
        print "Unable to download bill: ", $response->status_line, "\n";
    }
    return;
}

# error($message)
#
# Print $message followed by a newline on STDERR and exit with status 1.
sub error {
    my ($error) = @_;
    print STDERR $error;
    print STDERR "\n";
    exit 1;
}