ScriptingWeb

Web Scraping with Perl scripting

Among its many powerful modules, Perl offers several for web scraping, that is, extracting the required information from HTML web pages.

Below is a sample script that I wrote to extract movie information from the pages of the www.justtollywood.com web site.

#! perl
#===============================================================================
# Objective:
# ----------
#
# Perl script to demo the web scraping modules to extract intended information 
# from web pages. 
#
# For this example, I used www.justtollywood.com pages.
#
# $Header: $
#===============================================================================
# Include Modules
#===============================================================================
use strict;
use warnings;
use Pod::Usage;
use File::Basename;
use HTML::TableExtract;
use HTML::TreeBuilder 3;
use Getopt::Long qw(:config no_ignore_case bundling);
#===============================================================================
# Global Variables Declaration
#===============================================================================
use vars qw($DEBUG $SRC_FLDR $DEST_FLDR $HTML_TBL_OBJ $CP_IMAGS $IMGS_FLDR
            %DATAKEYS_MAPS %DATAKEYS %MON2NUM $TREE %MAP_DATAKEYS $OUTPUT_HEADER
            $OUTPUT_CONSOLI);
#===============================================================================
# Prototypes Section
#===============================================================================
sub CopyFile;
sub DoAction;
sub MyReaddir;
sub InitGlobals;
sub ProcessArgs;
sub ProcessFile;
sub PrintResults;
# Console logging helpers. Each takes a single message string.
#   Info   - informational message on STDOUT.
#   MyErr  - error message on STDERR.
#   MyWarn - warning message on STDOUT.
#   MyDie  - error message on STDERR, then terminate with exit status 1.
#   Debug  - message on STDOUT, printed only when $DEBUG is true.
sub Info {
    my $mesg = shift;
    print STDOUT "INFO:  $mesg\n";
}
sub MyErr {
    my $mesg = shift;
    print STDERR "ERROR: $mesg\n";
}
sub MyWarn {
    my $mesg = shift;
    print STDOUT "WARNING: $mesg\n";
}
sub MyDie {
    my $mesg = shift;
    print STDERR "ERROR: $mesg\n";
    exit(1);
}
sub Debug {
    my $mesg = shift;
    print STDOUT "DEBUG: $mesg\n" if $DEBUG;
}
#===============================================================================
# main()
#===============================================================================
# Script entry point: set up the lookup tables, read the command line,
# process every page found in the source folder, then exit successfully.
MAIN: {
    InitGlobals();
    ProcessArgs();
    DoAction();
    exit 0;
}
#===============================================================================
# sub InitGlobals
#===============================================================================
sub InitGlobals {
    # Fills the global lookup tables used by the rest of the script:
    #   %MON2NUM       - English month name => month number (1..12).
    #   %DATAKEYS      - output column name => column position (1..17).
    #   %DATAKEYS_MAPS - output column name => key used in the per-movie hash.
    #   %MAP_DATAKEYS  - the inverse of %DATAKEYS_MAPS.
    #   $OUTPUT_HEADER - the column names, in column order, joined by ", ".

    my @month_names = qw(January February March April May June July
                         August September October November December);
    %MON2NUM = ();
    @MON2NUM{@month_names} = (1 .. 12);

    # Output columns, listed in the order in which they are written.
    my @columns = qw(TITLE CAPTION BANNER CENSOR CAST CAST-OTHERS DIALOGUES
                     MUSIC-DIRECTOR EDITING CINEMATOGRAPHY SCREENPLAY
                     DIRECTION PRODUCER RELEASE-DATE GENRE LANGUAGE IMAGE);
    %DATAKEYS = ();
    @DATAKEYS{@columns} = (1 .. scalar @columns);

    # Only these columns use a different key in the per-movie hash; every
    # other column maps to itself.
    my %alternate_key = (
        "CAST-OTHERS"    => "SUPPORTING CAST",
        "MUSIC-DIRECTOR" => "MUSIC",
        "CINEMATOGRAPHY" => "CINEMATOGRAPHER",
        "DIRECTION"      => "DIRECTOR",
    );
    %DATAKEYS_MAPS = ();
    foreach my $column (@columns) {
        $DATAKEYS_MAPS{$column} = exists $alternate_key{$column}
                                    ? $alternate_key{$column}
                                    : $column;
    }

    # The mapped keys are all distinct, so the table can simply be inverted.
    %MAP_DATAKEYS = reverse %DATAKEYS_MAPS;

    $OUTPUT_HEADER = join(", ", @columns);
    return;
}
#===============================================================================
# sub ProcessArgs
#===============================================================================
sub ProcessArgs {
    # Parses the command line into the option globals and prepares the
    # output locations.
    #
    # Options: -D debug, -c copy images, -o consolidated CSV,
    #          -s <folder> source folder (required), -d <folder> destination
    #          folder, -h/-? full help.
    #
    # Side effects:
    #   * exits through pod2usage() on bad options or stray arguments;
    #   * exits through MyDie() when no source folder is given;
    #   * creates "<source>\output-files" when -d is not given;
    #   * creates "<destination>\images" when -c is given;
    #   * opens OUT__COMB_FILE_CSV for append and writes the header line
    #     when -o is given.
    Getopt::Long::Configure("bundling", "no_ignore_case");
    if (! GetOptions('D'   => \$DEBUG,
                     'c'   => \$CP_IMAGS,
                     'o'   => \$OUTPUT_CONSOLI,
                     's=s' => \$SRC_FLDR,
                     'd=s' => \$DEST_FLDR,
                     'h|?' => sub { pod2usage(-verbose => 2) }) || @ARGV ) {
        pod2usage(2);
    }

    MyDie("Specify the folder path of html files to be read") if (! $SRC_FLDR);

    if (! $DEST_FLDR) {
        $DEST_FLDR = "$SRC_FLDR\\output-files";
        if (! -d $DEST_FLDR && ! mkdir($DEST_FLDR)) {
            MyWarn("Failed to create \"$DEST_FLDR\" directory: $!. " .
                   "Saving Output Files in CWD");
            # Actually fall back to the current directory, as the warning
            # promises; otherwise every later open() would fail.
            $DEST_FLDR = ".";
        }
    }

    if ($CP_IMAGS) {
        $IMGS_FLDR = "$DEST_FLDR\\images";
        if (! -d $IMGS_FLDR && ! mkdir($IMGS_FLDR)) {
            MyWarn("Failed to create \"$IMGS_FLDR\" directory: $!. " .
                   "Skipping Images Copy");
            $CP_IMAGS = 0;
        }
    }

    if ($OUTPUT_CONSOLI) {
        my $output_file = "$DEST_FLDR\\Combined-Output-File.csv";
        if (open(OUT__COMB_FILE_CSV, '>>', $output_file)) {
            print OUT__COMB_FILE_CSV "$OUTPUT_HEADER\n";
        } else {
            MyWarn("Failed to open $output_file file: $!");
            # Without a usable handle the consolidated output is disabled,
            # so nothing later prints to an unopened filehandle.
            $OUTPUT_CONSOLI = 0;
        }
    }
}
#===============================================================================
# sub DoAction
#===============================================================================
sub DoAction {
    # Processes every page found in $SRC_FLDR: each file is parsed with
    # ProcessFile() and the extracted details are written by PrintResults().
    # Exits with status 0 when the folder holds no matching files.
    my %html_files = ();
    MyReaddir($SRC_FLDR, \%html_files);

    if (! scalar keys %html_files) {
        Info("There are no files in the specified folder path: [$SRC_FLDR]");
        exit 0;
    }

    foreach my $html_file (sort keys %html_files) {
        next if (! -e "$SRC_FLDR\\$html_file");
        Debug("Parsing File: [$html_file]");
        my $href = ProcessFile($html_file);
        PrintResults($html_file, $href);
        Info("Completed processing $html_file");
    }

    # The consolidated file is only open when -o was given and the open
    # succeeded; closing an unopened handle triggers a warning. Buffered
    # write errors surface at close time, so the result is checked.
    if (defined fileno(OUT__COMB_FILE_CSV)) {
        close(OUT__COMB_FILE_CSV)
            or MyWarn("Failed to close the combined output file: $!");
    }
}
#===============================================================================
# sub MyReaddir
#
# Read the contents of a directory and populate in a hash
#===============================================================================
sub MyReaddir {
    # Collects the names of the .html and .php files in a directory.
    #
    # Parameters:
    #   $srcdir  - path of the directory to read.
    #   $hashref - reference to a hash; every matching file name is added
    #              as a key with the value 0.
    #
    # When the directory cannot be opened a message is logged and the hash
    # is left untouched.
    my ($srcdir, $hashref) = @_;
    my $dir_handle;
    if (! opendir($dir_handle, $srcdir)) {
        Info("opendir() failed for $srcdir: $!");
        return;
    }
    Debug("Reading list of files from folder: $srcdir");
    while (defined (my $file = readdir $dir_handle)) {
        next if $file =~ /^\.\.?$/;            # Skip . and ..
        # Keep only real .html/.php extensions. The dot is escaped so that
        # names such as "fooxhtml" are no longer accepted.
        next if $file !~ /\.(?:html|php)$/;
        $$hashref{$file} = 0;
    }
    closedir($dir_handle);
}
#===============================================================================
# sub ProcessFile
#
# 
#===============================================================================
sub ProcessFile {
    # Extracts the details of one movie from a saved page.
    #
    # Parameter: the page's file name, relative to $SRC_FLDR.
    # Returns:   a reference to a hash of the extracted details. "LANGUAGE"
    #            is always present; "TITLE", "CAPTION", "RELEASE-DATE",
    #            "IMAGE" and the table sections are present only when found.
    #
    # The page is read in three passes:
    #   1. the <h2> inside <div id="topbar"> gives title, caption and date;
    #   2. children of class="content" elements that carry a src attribute
    #      give the image(s);
    #   3. the tables matched by $HTML_TBL_OBJ give the remaining sections.
    my $input_file = "$SRC_FLDR\\" . shift;
    my %movies;
    $HTML_TBL_OBJ   = HTML::TableExtract->new( debug => -1,
                                               attribs => {cellspacing => 1,
                                                           cellpadding => 2,
                                                           border      => 0 });
    $TREE           = HTML::TreeBuilder->new();
    $movies{"LANGUAGE"} = "Telugu";

    # parse_file() returns undef when the file cannot be opened.
    if (! $TREE->parse_file($input_file)) {
        MyWarn("Failed to parse $input_file file: $!");
        $TREE->delete();
        return (\%movies);
    }

    # Pass 1: heading. Its text is split on non-breaking spaces; at least
    # four pieces are needed (title, -, caption, release date).
    foreach my $node ($TREE->look_down(_tag => 'div', id => 'topbar')) {
        foreach my $heading ($node->find_by_tag_name('h2')) {
            next if (! ref $heading);
            my @parts = split("\xa0", $heading->as_text());
            next if ($#parts <= 2);

            my $release_date = $parts[3];
            $release_date =~ s/\(|\)//g;
            $release_date =~ s/,//g;
            my @dates = split(" ", $release_date);
            if ($#dates > 1) {
                my $month = $MON2NUM{$dates[0]};
                if (defined $month) {
                    $movies{"RELEASE-DATE"} = "$month/$dates[1]/$dates[2]";
                } else {
                    # An unknown month name would otherwise produce a
                    # malformed "/day/year" value.
                    MyWarn("Unrecognized month \"$dates[0]\" in release " .
                           "date of $input_file");
                }
            }
            $movies{"TITLE"}   = $parts[0];
            $movies{"CAPTION"} = $parts[2];
        }
    }

    # Pass 2: images.
    foreach my $node ($TREE->find_by_attribute("class", "content")) {
        foreach my $sub_node ($node->content_list()) {
            next if (! ref $sub_node);
            my $src_path = $sub_node->attr("src");
            next if (! $src_path);
            $src_path =~ s/\//\\/g;
            my $img_file = "$SRC_FLDR\\$src_path";

            # A copied image is named after the title, so copying is only
            # possible when a title was found; otherwise record the path.
            my $can_copy = $CP_IMAGS
                           && defined $movies{"TITLE"}
                           && $movies{"TITLE"} ne "";
            if ($can_copy) {
                CopyFile($img_file, $IMGS_FLDR, $movies{"TITLE"}, \%movies);
            } elsif (! exists $movies{"IMAGE"}) {
                $movies{"IMAGE"} = "$img_file";
            } else {
                $movies{"IMAGE"} = $movies{"IMAGE"} . "; $img_file";
            }
        }
    }

    # The element tree is no longer needed. Deleting it explicitly releases
    # its memory, which matters when many pages are processed in one run.
    $TREE->delete();

    # Pass 3: tables. A row with at most one usable cell names the section
    # that the following rows belong to; a row with more cells contributes
    # its first usable cell as a value of the current section.
    my $key;
    $HTML_TBL_OBJ->parse_file($input_file);
    foreach my $table ($HTML_TBL_OBJ->tables) {
        foreach my $row ($table->rows) {
            my $row_length = 0;
            my $line = "";
            foreach my $row_data (@$row) {
                if ($row_data) {
                    # remove new line characters
                    $row_data =~ s/\n//g;
                    # skip cells that start with a non-word character
                    next if ($row_data =~ /^\W/);
                    $row_length++;
                    next if ($row_length > 1);
                    # drop everything from the last "(" onwards
                    $row_data =~ s/(.*)\(.*/$1/g;
                    $line .= "$row_data";
                }
            }

            if ($row_length <= 1) {
                $key = uc ($line);
                next;
            }
            # Ignore rows that precede any heading or belong to a section
            # that is not reported.
            next if (! defined $key || $key eq ""
                     || ! exists $MAP_DATAKEYS{$key});
            if (! exists $movies{$key}) {
                $movies{$key} = "$line" ;
            } else {
                $movies{$key} = $movies{$key} . "; $line" ;
            }
        }
    }

    return (\%movies);
}
#===============================================================================
# sub PrintResults
#
# 
#===============================================================================
sub PrintResults {
    # Writes the details of one movie as a CSV row.
    #
    # Parameters:
    #   $input_file - name of the page the details came from; used to name
    #                 the output file when the movie has no title.
    #   $href       - reference to the hash returned by ProcessFile().
    #
    # The header and the row are appended to "<DEST_FLDR>\<title>.csv".
    # When the consolidated output is enabled and open, the row is also
    # appended to it.
    #
    # Returns 1 when the per-movie file was written, 0 otherwise.
    my ($input_file, $href) = @_;

    my $title = $$href{"TITLE"};
    if (! defined $title || $title eq "") {
        # No title found: name the file after the page, minus its extension.
        ($title = $input_file) =~ s/\.[^.\\\/]*$//;
    }
    my $output_file = "$DEST_FLDR\\$title.csv";

    Debug("\t\t[Printing Results Hash]");
    my @values;
    foreach my $key (sort { $DATAKEYS{$a} <=> $DATAKEYS{$b}} keys %DATAKEYS) {
        my $value = (defined $$href{$DATAKEYS_MAPS{$key}})
                        ? $$href{$DATAKEYS_MAPS{$key}} : "-";
        Debug(sprintf("%-30s %s\n", $key, $value)) if ($DEBUG);
        push @values, $value;
    }
    # join() keeps every column, including empty leading ones.
    my $str = join(", ", @values);

    my $written = 1;
    if (open(my $out_fh, '>>', $output_file)) {
        print {$out_fh} "$OUTPUT_HEADER\n$str\n";
        close($out_fh)
            or MyWarn("Failed to close $output_file file: $!");
    } else {
        MyWarn("Failed to open $output_file file: $!");
        $written = 0;
    }

    # The consolidated row is written even when the per-movie file could
    # not be created, so the movie is not lost from the combined output.
    if ($OUTPUT_CONSOLI && defined fileno(OUT__COMB_FILE_CSV)) {
        print OUT__COMB_FILE_CSV "$str\n";
    }

    return $written;
}
#===============================================================================
# sub CopyFile
#===============================================================================
sub CopyFile {
    # Copies an image into a folder under a new base name, keeping the
    # source file's extension, and records the copy in the movie hash.
    #
    # Parameters:
    #   $src_file - path of the file to copy.
    #   $dst_dir  - folder that receives the copy.
    #   $new_name - base name (without extension) for the copy.
    #   $href     - reference to the movie hash; on success its "IMAGE"
    #               entry is set to "\images\<new name>.<extension>".
    #
    # Returns 1 on success and 0 when the source is missing or the copy
    # fails.
    my ($src_file, $dst_dir, $new_name, $href) = @_;
    if (! -e $src_file) {
        MyErr("$src_file file does not exist");
        return 0;
    }

    # Take the extension from the last path component only, so that a dot
    # in a folder name or a file without an extension cannot corrupt it.
    my $new_file = $new_name;
    if ($src_file =~ /\.([^.\\\/]+)$/) {
        $new_file .= ".$1";
    }
    my $new_file_name = "$dst_dir\\$new_file";

    # File::Copy is used instead of a shell "copy" command: the new name
    # comes from page content and must never reach a command line.
    require File::Copy;
    if (! File::Copy::copy($src_file, $new_file_name)) {
        MyWarn("Failed to copy $src_file file to $new_file_name: $!");
        return 0;
    }

    Debug("Successfully copied $src_file file to $new_file_name");
    $$href{"IMAGE"} = "\\images\\$new_file";
    return 1;
}
#===============================================================================
# documentation
#===============================================================================
__END__
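=head1 NAME

B<.pl> - extract movie details from saved www.justtollywood.com HTML pages into CSV files.

=head1 SYNOPSIS

.pl -s <source-folder> [-d <destination-folder>] [-c] [-o] [-D] [-h]

=head1 OPTIONS

=over 4

=item B<-s> <source-folder>

Folder that contains the saved .html/.php pages to be read. Required.

=item B<-d> <destination-folder>

Folder in which the output files are saved. Defaults to an "output-files" folder inside the source folder.

=item B<-c>

Copy the image of each movie into an "images" subfolder of the destination folder.

=item B<-o>

Also write the details of all movies to a consolidated Combined-Output-File.csv file.

=item B<-D>

Run in debug mode.

=item B<-h>

Print this help message.

=back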

Output:

Creates a csv file for each movie with extracted information
C:\>dir /os C:\www.justtollywood.com\output-files\ | head -15
 Volume in drive C has no label.
 Volume Serial Number is 3A1F-BF27
 Directory of C:\www.justtollywood.com\output-files
01/21/2011  02:28 AM    <DIR>          .
01/21/2011  02:28 AM    <DIR>          ..
01/21/2011  02:28 AM    <DIR>          images
01/21/2011  02:26 AM               234 Lavkus.csv
01/21/2011  02:28 AM               242 Layam.csv
01/21/2011  02:28 AM               243 Bullama Bullodu.csv
01/21/2011  02:27 AM               245 Bay.csv
01/21/2011  02:28 AM               246 Love College.csv
01/21/2011  02:28 AM               246 Harror No.1.csv
01/21/2011  02:25 AM               246 Hanthakudu.csv
C:\>type C:\www.justtollywood.com\output-files\Lavakusa.csv
TITLE, CAPTION, BANNER, CENSOR, CAST, CAST-OTHERS, DIALOGUES, MUSIC-DIRECTOR, EDITING, CINEMATOGRAPHY, SCREENPLAY, DIRECTION, PRODUCER, RELE
ASE-DATE, GENRE, LANGUAGE, IMAGE
Lavakusa, , Lalitha Siva Jyothi Films , -, Anjali Devi; N T RamaRao; Chittor V Nagaiah; Relangi; Kaikala Satyanarayana; Shoban Babu; Suryaka
ntham , Ramana Reddy; Kantha Rao; Janardhana Rao Arja; Kannambha; S Varalakshmi; Master Nagaraju, Vempati Sadasiva Brahmam, Ghantasala, A Sa
njeevi , P L Roy, C Pullaiah; C S Rao, C Pullaiah; C S Rao, A Sankar Reddy, 3/29/1963, -, Telugu, -
TITLE, CAPTION, BANNER, CENSOR, CAST, CAST-OTHERS, DIALOGUES, MUSIC-DIRECTOR, EDITING, CINEMATOGRAPHY, SCREENPLAY, DIRECTION, PRODUCER, RELE
ASE-DATE, GENRE, LANGUAGE, IMAGE
Lavakusa, , East India Films, -, Parupalli Subba Rao; Sri Ranjani; Chittor V Nagaiah, -, -, -, -, -, -, C Pullaiah, Bajaranjalal Kemka, -, -
, Telugu, -
C:\>
Creates an images subfolder with the image files of all the movies.
C:\>dir /os C:\www.justtollywood.com\output-files\images | head -15
 Volume in drive C has no label.
 Volume Serial Number is 3A1F-BF27
 Directory of C:\www.justtollywood.com\output-files\images
01/21/2011  02:28 AM    <DIR>          ..
01/21/2011  02:28 AM    <DIR>          .
08/18/2010  01:12 PM             1,221 Bakta Pothana.jpg
08/17/2010  03:16 PM             1,221 Alludugaru.jpg
08/16/2010  05:06 PM             1,221 Aadabrathuku.jpg
08/18/2010  01:23 PM             1,221 Bale Ammaiyulu.jpg
08/17/2010  03:45 PM             1,221 Andaru Dongale.jpg
07/04/2009  06:58 PM             3,107 Anasuyamma Gari Alludu.jpg
07/04/2009  07:20 PM             4,162 Akarshana.jpg
07/04/2009  06:56 PM             4,206 Aalu Magalu.jpeg
C:\>
Creates a consolidated .csv file with needed details of all the movies.
C:\>dir C:\www.justtollywood.com\output-files\*combine*
 Volume in drive C has no label.
 Volume Serial Number is 3A1F-BF27
 Directory of C:\www.justtollywood.com\output-files
01/21/2011  02:28 AM           372,348 Combined-Output-File.csv
               1 File(s)        372,348 bytes
               0 Dir(s)  45,088,911,360 bytes free
C:\>head -15 "C:\www.justtollywood.com\output-files\Combined-Output-File.csv"
TITLE, CAPTION, BANNER, CENSOR, CAST, CAST-OTHERS, DIALOGUES, MUSIC-DIRECTOR, EDITING, CINEMATOGRAPHY, SCREENPLAY, DIRECTION, PRODUCER, RELE
ASE-DATE, GENRE, LANGUAGE, IMAGE
Chinthamani, , Madhan Theathers, -, Ramathilakam; Pulipati Venkateswarlu, -, -, -, -, -, -, Kalakuri Sadasiva Rao, Madhan Theathers, -, -, T
elugu, \images\Chinthamani.gif
Avvarini Nammmali, , Natarajan Pictures, -, Harinath; Rajasri, -, -, -, -, -, -, -, -, -, -, Telugu, \images\Avvarini Nammmali.gif
Panchakshari, , Sai Ratna Creations, -, Anushka; Samrat Reddy, Nasser; Chandra Mohan; Brahmanandam ; Pradeep Rawat; Ali; Raghu Babu; M S Nar
ayana; Uttej; Sakunthala; Sana; Jayavani; Jeeva; Benarjee; Ravi Prakash; Telangana Sakuntala; Keerthi , Thota Prasad, Chinna , Marthand K Ve
nkatesh , Vasu , -, V Samudra, Bommadevara Ramachandra Rao, 6/11/2010, -, Telugu, -
Laila Majnu, , Bharani Pictures, -, Bhanumathi Ramakrishna; Akkineni Nageswara Rao, Seeta Rama Anjaneyulu Chilakalpudi; Sri Ranjani; Mukkamu
la; Lalitha Dubey; Kasturi Siva Rao; Padmini; Arrani Satyanarayana; Hemalatha, Samudrala Raghavacharya , C R Subbaraman, -, B S Ranga, Samud
rala Raghavacharya , P S Ramakrishna Rao, Bhanumathi Ramakrishna; P S Ramakrishna Rao, 10/1/1949, -, Telugu, -
Drohi, , Swatahantra, -, K S Prakash Rao; G Varalakshmi; L V Prasad, -, -, -, -, -, -, L V Prasad, K S Prakash Rao, -, -, Telugu, \images\Dr
ohi.jpg
Atha Mechina Aluudu, , Sri Lalitha Kalanjali, -, Krishna ; JayaPradha, -, -, K V Mahadevan, -, -, -, Kodi Ramakrishna, Vakada Appa Rao, 1/19
/1989, -, Telugu, \images\Atha Mechina Aluudu.gif
Daggubati Rana & Suresh Babu New Movie, , Suresh Productions Pvt. Ltd. , -, Rana Daggubati , -, -, -, -, -, -, Anand Ranga, Daggubati Suresh
 Babu , -, -, Telugu, \images\Daggubati Rana & Suresh Babu New Movie.gif
Anveshana, , RamKumar Productions, -, Bhanu Priya; Karthik, Kaikala Satyanarayana; Rallapalli; Sharath Babu; Subhalekha Sudhakar ; M Mallika
rjuna Rao; Y Vijaya; K Viswanatham; Balaji, -, Ilayaraja, G R Anil Malnad, M V Raghu, -, Vamsi, Kamineni Prasad, -, -, Telugu, -
Chasastapu Mogudu, , Rajyalakshmi Cine Arts, -, Suman; Bhanu Priya, -, -, -, -, -, -, Sarath, Midha Rama Rao, -, -, Telugu, \images\Chasasta
pu Mogudu.gif
Dabbu Bale Jabbu, , Allu Arts, -, Raj Kumar; SumaLatha, Brahmanandam ; Hema, -, -, -, -, -, K S Rajendra, Allu Ramalingaiah , -, -, Telugu,
\images\Dabbu Bale Jabbu.jpg
Bharateeyudu, , Sri Surya Movies, -, Kamal Hassan; Manisha Koirala; Urmila, Kasthuri ; Sukanya; Goudarmani; Manorama; Nedumudi Venu, Sujatha
 , A R Rahman, V T Vijayan, Jeeva Shankar, Shankar , Shankar , A M Rathnam, 8/23/1996, -, Telugu, \images\Bharateeyudu.jpg
Lakshmi Kataksham, , P S R, -, N T RamaRao; K R Vijaya; Rajasri, Kaikala Satyanarayana; Prabhakar Reddy; Balayya; Mikkilineni; Hemalatha, Ch
illara Bhavan Narayana Rao, S P Kodandapani, K Govinda Swamy, H S Venu, B Vitala charya, B Vitala charya, Pinjala Subba Rao, 3/12/1970, -, T
elugu, -
Bhakta Kabir, , Chamrima Takies, -, Seeta Rama Anjaneyulu Chilakalpudi, -, -, -, -, -, -, Rai Mothilal Chamriya; Chitrapu Narayana Rao, Rai
Mothilal Chamriya, -, -, Telugu, \images\Bhakta Kabir.gif
Attinti Sapam, , Modern Theaters, -, Saroja, -, -, -, -, -, -, Acharaya M Mastan, T R Sundaram, -, -, Telugu, \images\Attinti Sapam.gif
C:\>

Advertisements

Leave a Reply

This site uses Akismet to reduce spam. Learn how your comment data is processed.