Web Scraping with Perl scripting

Alongside its many other powerful modules, Perl offers several modules for web scraping, that is, extracting the required information from HTML web pages.

Below is a sample script that I wrote to extract movie information from pages of the www.justtollywood.com website.

[code language="perl"]
#! perl
#===============================================================================
# Objective:
# ----------
#
# Perl script to demo the web scraping modules to extract intended information
# from web pages.
#
# For this example, I used www.justtollywood.com pages.
#
# $Header: $
#===============================================================================
# Include Modules
#===============================================================================
use strict;
use warnings;
use Pod::Usage;
use File::Basename;
use HTML::TableExtract;
use HTML::TreeBuilder 3;
use Getopt::Long qw(:config no_ignore_case bundling);
#===============================================================================
# Global Variables Declaration
#===============================================================================
use vars qw($DEBUG $SRC_FLDR $DEST_FLDR $HTML_TBL_OBJ $CP_IMAGS $IMGS_FLDR
%DATAKEYS_MAPS %DATAKEYS %MON2NUM $TREE %MAP_DATAKEYS $OUTPUT_HEADER
$OUTPUT_CONSOLI);
#===============================================================================
# Prototypes Section
#===============================================================================
sub CopyFile;
sub DoAction;
sub MyReaddir;
sub InitGlobals;
sub ProcessArgs;
sub ProcessFile;
sub PrintResults;
# Logging helpers. Each one takes the message text as its first argument and
# prints it on a single line behind a fixed severity prefix.

# Informational message on STDOUT.
sub Info {
    my $text = shift;
    print STDOUT "INFO: $text\n";
}

# Error message on STDERR; the script keeps running.
sub MyErr {
    my $text = shift;
    print STDERR "ERROR: $text\n";
}

# Warning message on STDOUT.
sub MyWarn {
    my $text = shift;
    print STDOUT "WARNING: $text\n";
}

# Error message on STDERR, then terminate the script with exit status 1.
sub MyDie {
    my $text = shift;
    print STDERR "ERROR: $text\n";
    exit(1);
}

# Debug message on STDOUT, printed only when the global $DEBUG flag is true.
sub Debug {
    my $text = shift;
    print STDOUT "DEBUG: $text\n" if $DEBUG;
}
#===============================================================================
# main()
#===============================================================================
# Script entry point: build the lookup tables, read the command line, process
# every page in the source folder, then leave with a success status.
MAIN: {
    InitGlobals();
    ProcessArgs();
    DoAction();
    exit 0;
}
#===============================================================================
# sub InitGlobals
#===============================================================================
# Populate the global lookup tables used by the rest of the script:
#   %MON2NUM       - English month name => month number (1..12)
#   %DATAKEYS      - output column name => 1-based column position
#   %DATAKEYS_MAPS - output column name => key under which ProcessFile stores
#                    the value (the upper-cased heading found on the page)
#   %MAP_DATAKEYS  - the inverse of %DATAKEYS_MAPS
#   $OUTPUT_HEADER - the column names joined with ", " in column order
sub InitGlobals {
    my @month_names = qw(
        January February March     April   May      June
        July    August   September October November December
    );
    %MON2NUM = map { $month_names[$_] => $_ + 1 } 0 .. $#month_names;

    # Output columns, listed in the order they appear in the CSV files.
    my @columns = qw(
        TITLE CAPTION BANNER CENSOR CAST CAST-OTHERS DIALOGUES MUSIC-DIRECTOR
        EDITING CINEMATOGRAPHY SCREENPLAY DIRECTION PRODUCER RELEASE-DATE
        GENRE LANGUAGE IMAGE
    );
    %DATAKEYS = map { $columns[$_] => $_ + 1 } 0 .. $#columns;

    # Only these columns are stored under a key that differs from the column
    # name; every other column maps to itself.
    my %page_heading_for = (
        "CAST-OTHERS"    => "SUPPORTING CAST",
        "MUSIC-DIRECTOR" => "MUSIC",
        "CINEMATOGRAPHY" => "CINEMATOGRAPHER",
        "DIRECTION"      => "DIRECTOR",
    );
    %DATAKEYS_MAPS = map {
        $_ => (exists $page_heading_for{$_} ? $page_heading_for{$_} : $_)
    } @columns;

    # All mapped values are unique, so reversing the hash yields the exact
    # inverse lookup.
    %MAP_DATAKEYS = reverse %DATAKEYS_MAPS;

    $OUTPUT_HEADER = join(", ", @columns);
    return;
}
#===============================================================================
# sub ProcessArgs
#===============================================================================
# Parse the command line into the option globals and prepare the output
# locations.
#
# Options:
#   -D        set $DEBUG
#   -c        set $CP_IMAGS (copy images into "<dest>\images")
#   -o        set $OUTPUT_CONSOLI (also write Combined-Output-File.csv)
#   -s <dir>  $SRC_FLDR, folder holding the pages to read (required)
#   -d <dir>  $DEST_FLDR, output folder (default "<src>\output-files")
#   -h, -?    print the full POD help and exit
#
# Prints usage and exits on an unknown option or a leftover argument, and
# exits through MyDie when no source folder is given. When -o is in effect the
# combined file is opened for append on the OUT__COMB_FILE_CSV handle, which
# PrintResults writes to and DoAction closes.
sub ProcessArgs {
    Getopt::Long::Configure("bundling", "no_ignore_case");
    my $options_ok = GetOptions(
        'D'   => \$DEBUG,
        'c'   => \$CP_IMAGS,
        'o'   => \$OUTPUT_CONSOLI,
        's=s' => \$SRC_FLDR,
        'd=s' => \$DEST_FLDR,
        'h|?' => sub { pod2usage(-verbose => 2) },
    );
    pod2usage(2) if (! $options_ok || @ARGV);

    MyDie("Specify the folder path of html files to be read") if (! $SRC_FLDR);

    if (! $DEST_FLDR) {
        $DEST_FLDR = "$SRC_FLDR\\output-files";
        if (! -e $DEST_FLDR && ! mkdir($DEST_FLDR)) {
            MyWarn("Failed to create \"$DEST_FLDR\" directory: $!. " .
                   "Saving Output Files in CWD");
            # Actually fall back to the current directory, as the warning
            # promises; otherwise every later open() would fail.
            $DEST_FLDR = ".";
        }
    }

    if ($CP_IMAGS) {
        $IMGS_FLDR = "$DEST_FLDR\\images";
        if (! -e $IMGS_FLDR && ! mkdir($IMGS_FLDR)) {
            MyWarn("Failed to create \"$IMGS_FLDR\" directory: $!. " .
                   "Skipping Images Copy");
            $CP_IMAGS = 0;
        }
    }

    if ($OUTPUT_CONSOLI) {
        my $output_file = "$DEST_FLDR\\Combined-Output-File.csv";
        # The file is opened for append, so only write the header when the
        # file is new or empty; otherwise each run would add another header
        # in the middle of the data.
        my $needs_header = ! -s $output_file;
        if (open(OUT__COMB_FILE_CSV, '>>', $output_file)) {
            print OUT__COMB_FILE_CSV "$OUTPUT_HEADER\n" if ($needs_header);
        }
        else {
            MyWarn("Failed to open $output_file file: $!");
            # Without an open handle there is nothing to consolidate into.
            $OUTPUT_CONSOLI = 0;
        }
    }
    return;
}
#===============================================================================
# sub DoAction
#===============================================================================
# Drive the whole run: list the pages in $SRC_FLDR, parse each one that still
# exists, write its results, and finally close the combined output handle.
# Exits the script with status 0 when the folder yields no pages.
sub DoAction {
    my %pages;
    MyReaddir($SRC_FLDR, \%pages);

    unless (keys %pages) {
        Info("There are no files in the specified folder path: [$SRC_FLDR]");
        exit 0;
    }

    for my $page (sort keys %pages) {
        if (-e "$SRC_FLDR\\$page") {
            Debug("Parsing File: [$page]");
            my $movie = ProcessFile($page);
            PrintResults($page, $movie);
            Info("Completed processing $page");
        }
    }

    close OUT__COMB_FILE_CSV;
}
#===============================================================================
# sub MyReaddir
#
# Read the contents of a directory and populate in a hash
#===============================================================================
# Collect the page files found directly inside a directory.
#
#   $srcdir  - directory to scan (not recursive)
#   $hashref - hash reference filled in place: each file name ending in
#              ".html" or ".php" becomes a key with the value 0
#
# Reports the failure and returns without touching the hash when the
# directory cannot be opened.
sub MyReaddir {
    my ($srcdir, $hashref) = @_;
    my $dir_handle;
    if (! opendir($dir_handle, $srcdir)) {
        MyErr("opendir() failed for $srcdir: $!");
        return;
    }
    Debug("Reading list of files from folder: $srcdir");
    while (defined (my $file = readdir $dir_handle)) {
        # Keep only names with a literal ".html" or ".php" extension. The dot
        # is escaped so names such as "xhtml" no longer slip through; this
        # test also excludes the "." and ".." entries.
        next if ($file !~ /\.(?:html|php)\z/);
        $$hashref{$file} = 0;
    }
    return closedir($dir_handle);
}
#===============================================================================
# sub ProcessFile
#
#
#===============================================================================
# Extract the details of one movie page.
#
# Takes the page's file name relative to $SRC_FLDR and returns a reference to
# a hash of the values found. Keys are "TITLE", "CAPTION", "RELEASE-DATE",
# "LANGUAGE", "IMAGE", plus the upper-cased table headings listed in
# %MAP_DATAKEYS. Repeated values for one key are joined with "; ".
#
# Three passes are made over the page:
#   1. the <h2> inside <div id="topbar"> gives title, caption, release date;
#   2. elements with a src attribute directly under class="content" nodes
#      give the image (recorded as a path, or copied when $CP_IMAGS is set);
#   3. the detail tables give everything else.
sub ProcessFile {
    my $input_file = "$SRC_FLDR\\" . shift;
    # "LANGUAGE" is preset to "Telugu" before anything is read from the page.
    my %movies = ("LANGUAGE" => "Telugu");

    $HTML_TBL_OBJ = HTML::TableExtract->new(
        debug   => -1,
        attribs => { cellspacing => 1, cellpadding => 2, border => 0 },
    );
    $TREE = HTML::TreeBuilder->new();
    $TREE->parse_file($input_file);

    # --- Pass 1: heading --------------------------------------------------
    # The heading text is split on non-breaking spaces; the code reads the
    # title from field 0, the caption from field 2 and the parenthesised
    # release date from field 3.
    foreach my $topbar ($TREE->look_down(_tag => 'div', id => 'topbar')) {
        foreach my $heading ($topbar->find_by_tag_name('h2')) {
            next if (! ref $heading);
            my @parts = split("\xa0", $heading->as_text());
            next if (@parts < 4);
            my ($title, $caption, $release_date) = @parts[0, 2, 3];

            # "(March 29, 1963)" -> "March 29 1963" -> "3/29/1963"
            $release_date =~ s/[(),]//g;
            my ($month_name, $day, $year) = split(" ", $release_date);
            # Record the date only when all three parts are present and the
            # month name is known; an unknown month used to produce a
            # malformed "/day/year" value.
            if (defined $year && defined $MON2NUM{$month_name}) {
                $movies{"RELEASE-DATE"} = "$MON2NUM{$month_name}/$day/$year";
            }
            $movies{"TITLE"}   = $title;
            $movies{"CAPTION"} = $caption;
        }
    }

    # --- Pass 2: images ---------------------------------------------------
    foreach my $content ($TREE->find_by_attribute("class", "content")) {
        foreach my $child ($content->content_list()) {
            next if (! ref $child);              # plain text node
            my $src_path = $child->attr("src");
            next if (! $src_path);
            $src_path =~ s/\//\\/g;              # URL slashes -> backslashes
            my $img_file = "$SRC_FLDR\\$src_path";
            if ($CP_IMAGS) {
                # CopyFile records the copied image in $movies{"IMAGE"}.
                CopyFile($img_file, $IMGS_FLDR, $movies{"TITLE"}, \%movies);
            }
            elsif (exists $movies{"IMAGE"}) {
                $movies{"IMAGE"} .= "; $img_file";
            }
            else {
                $movies{"IMAGE"} = $img_file;
            }
        }
    }

    # The tree is not needed any more; release it explicitly so parsing many
    # pages in one run does not accumulate parsed documents.
    $TREE->delete();

    # --- Pass 3: detail tables --------------------------------------------
    # A row with at most one usable cell is a section heading and sets the
    # current key; a row with more cells contributes its first cell as a
    # value for that key. $key therefore deliberately persists across rows
    # and tables.
    my $key;
    $HTML_TBL_OBJ->parse_file($input_file);
    foreach my $table ($HTML_TBL_OBJ->tables) {
        foreach my $row ($table->rows) {
            my $cell_count = 0;
            my $line = "";
            foreach my $cell (@$row) {
                next if (! $cell);
                $cell =~ s/\n//g;                # remove new line characters
                next if ($cell =~ /^\W/);        # skip cells starting with a non-word character
                $cell_count++;
                next if ($cell_count > 1);       # only the first usable cell is kept
                $cell =~ s/(.*)\(.*/$1/g;        # drop a trailing "(...)" note
                $line .= $cell;
            }

            if ($cell_count <= 1) {
                $key = uc($line);
                next;
            }
            # Ignore values that appear before any heading or under a heading
            # this script does not export; they used to be stored under an
            # empty key with "uninitialized value" warnings.
            next if (! defined $key || ! exists $MAP_DATAKEYS{$key});
            if (exists $movies{$key}) {
                $movies{$key} .= "; $line";
            }
            else {
                $movies{$key} = $line;
            }
        }
    }

    return (\%movies);
}
#===============================================================================
# sub PrintResults
#
#
#===============================================================================
# Write one movie's values as a CSV row.
#
#   $input_file - name of the page the values came from; used for the output
#                 file name when the movie has no title
#   $href       - hash reference returned by ProcessFile
#
# The row lists every column of %DATAKEYS in column order, with "-" for
# columns that have no value. It is appended to "<title>.csv" in $DEST_FLDR
# and, when $OUTPUT_CONSOLI is set, to the already-open combined file.
# Returns 1 on success and 0 when the per-movie file could not be written.
sub PrintResults {
    my ($input_file, $href) = @_;

    # Derive the output file name from the title, falling back to the page's
    # own name so a page without a heading does not end up as ".csv".
    my $title = $$href{"TITLE"};
    if (! defined $title || $title !~ /\S/) {
        ($title) = fileparse($input_file, qr/\.[^.]*/);
    }
    # The title comes from scraped HTML: replace characters that are path
    # separators or not allowed in Windows file names.
    (my $file_stem = $title) =~ s/[\\\/:*?"<>|]/_/g;
    my $output_file = "$DEST_FLDR\\$file_stem.csv";

    Debug("\t\t[Printing Results Hash]");
    my @values;
    foreach my $key (sort { $DATAKEYS{$a} <=> $DATAKEYS{$b} } keys %DATAKEYS) {
        my $stored = $$href{$DATAKEYS_MAPS{$key}};
        my $value  = defined $stored ? $stored : "-";
        Debug(sprintf("%-30s %s\n", $key, $value)) if ($DEBUG);
        push @values, $value;
    }
    # join() keeps empty leading fields; the former incremental string build
    # dropped them and shifted every following column to the left.
    my $str = join(", ", @values);

    # Write the combined row first so it is not lost when the per-movie file
    # cannot be opened.
    print OUT__COMB_FILE_CSV "$str\n" if ($OUTPUT_CONSOLI);

    # The per-movie file is opened for append (two movies can share a title),
    # so emit the header only when the file is new or empty.
    my $needs_header = ! -s $output_file;
    my $out_fh;
    if (! open($out_fh, '>>', $output_file)) {
        MyWarn("Failed to open $output_file file: $!");
        return 0;
    }
    print {$out_fh} "$OUTPUT_HEADER\n" if ($needs_header);
    print {$out_fh} "$str\n";
    if (! close($out_fh)) {
        # Buffered write errors only surface when the handle is closed.
        MyWarn("Failed to write $output_file file: $!");
        return 0;
    }
    return 1;
}
#===============================================================================
# sub CopyFile
#===============================================================================
# Copy an image into the images folder under a new base name.
#
#   $src_file - full path of the image to copy
#   $dst_dir  - folder to copy into
#   $new_name - base name for the copy (the movie title); the source file's
#               own base name is used when this is missing
#   $href     - movie hash; on success its "IMAGE" entry is set to the
#               copy's path relative to the output folder ("\images\<name>")
#
# The copy keeps the source file's extension. Returns 1 on success and 0 when
# the source is missing or the copy fails.
sub CopyFile {
    my ($src_file, $dst_dir, $new_name, $href) = @_;
    if (! -e $src_file) {
        MyErr("$src_file file does not exist");
        return 0;
    }

    # Work on the last path component only, so a dot in a folder name (for
    # example "www.justtollywood.com") is never mistaken for the extension.
    my ($base_name) = $src_file =~ /([^\\\/]*)\z/;
    my $src_extn = ($base_name =~ /\.([^.]*)\z/) ? $1 : "";
    (my $src_stem = $base_name) =~ s/\.[^.]*\z//;

    if (! defined $new_name || $new_name !~ /\S/) {
        $new_name = $src_stem;
    }
    # The name comes from scraped HTML: replace characters that are path
    # separators or not allowed in Windows file names.
    $new_name =~ s/[\\\/:*?"<>|]/_/g;

    my $target_leaf   = ($src_extn ne "") ? "$new_name.$src_extn" : $new_name;
    my $new_file_name = "$dst_dir\\$target_leaf";

    # Copy through File::Copy instead of building a shell "copy" command:
    # quotes, ampersands or other shell metacharacters in a scraped name can
    # no longer break or alter the command.
    require File::Copy;
    if (! File::Copy::copy($src_file, $new_file_name)) {
        MyWarn("Failed to copy $src_file file to $new_file_name: $!");
        return 0;
    }
    Debug("Successfully copied $src_file file to $new_file_name");
    $$href{"IMAGE"} = "\\images\\$target_leaf";
    return 1;
}
#===============================================================================
# documentation
#===============================================================================
__END__
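=head1 NAME

B<.pl> - extract movie details from saved www.justtollywood.com pages into CSV files.

=head1 SYNOPSIS

.pl -s <source-folder> [-d <destination-folder>] [-c] [-o] [-h] [-D]

=head1 OPTIONS

=over 4

=item B<-s> <source-folder>

Folder that holds the saved .html/.php pages to read. Required.

=item B<-d> <destination-folder>

Folder in which the .csv files are written. Defaults to an "output-files" folder inside the source folder.

=item B<-c>

Copy each movie's image into an "images" subfolder of the destination folder, named after the movie title.

=item B<-o>

Also append every movie's row to a consolidated Combined-Output-File.csv in the destination folder.

=item B<-D>

Run in debug mode.

=item B<-h>

Print this help message.

=back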

[/code]

Output:

[code language="text"]
Creates a .csv file for each movie, containing the extracted information.
C:\>dir /os C:\www.justtollywood.com\output-files\ | head -15
Volume in drive C has no label.
Volume Serial Number is 3A1F-BF27
Directory of C:\www.justtollywood.com\output-files
01/21/2011 02:28 AM <DIR> .
01/21/2011 02:28 AM <DIR> ..
01/21/2011 02:28 AM <DIR> images
01/21/2011 02:26 AM 234 Lavkus.csv
01/21/2011 02:28 AM 242 Layam.csv
01/21/2011 02:28 AM 243 Bullama Bullodu.csv
01/21/2011 02:27 AM 245 Bay.csv
01/21/2011 02:28 AM 246 Love College.csv
01/21/2011 02:28 AM 246 Harror No.1.csv
01/21/2011 02:25 AM 246 Hanthakudu.csv
C:\>type C:\www.justtollywood.com\output-files\Lavakusa.csv
TITLE, CAPTION, BANNER, CENSOR, CAST, CAST-OTHERS, DIALOGUES, MUSIC-DIRECTOR, EDITING, CINEMATOGRAPHY, SCREENPLAY, DIRECTION, PRODUCER, RELE
ASE-DATE, GENRE, LANGUAGE, IMAGE
Lavakusa, , Lalitha Siva Jyothi Films , -, Anjali Devi; N T RamaRao; Chittor V Nagaiah; Relangi; Kaikala Satyanarayana; Shoban Babu; Suryaka
ntham , Ramana Reddy; Kantha Rao; Janardhana Rao Arja; Kannambha; S Varalakshmi; Master Nagaraju, Vempati Sadasiva Brahmam, Ghantasala, A Sa
njeevi , P L Roy, C Pullaiah; C S Rao, C Pullaiah; C S Rao, A Sankar Reddy, 3/29/1963, -, Telugu, –
TITLE, CAPTION, BANNER, CENSOR, CAST, CAST-OTHERS, DIALOGUES, MUSIC-DIRECTOR, EDITING, CINEMATOGRAPHY, SCREENPLAY, DIRECTION, PRODUCER, RELE
ASE-DATE, GENRE, LANGUAGE, IMAGE
Lavakusa, , East India Films, -, Parupalli Subba Rao; Sri Ranjani; Chittor V Nagaiah, -, -, -, -, -, -, C Pullaiah, Bajaranjalal Kemka, -, –
, Telugu, –
C:\>
Creates an images subfolder containing the image files of all the movies.
C:\>dir /os C:\www.justtollywood.com\output-files\images | head -15
Volume in drive C has no label.
Volume Serial Number is 3A1F-BF27
Directory of C:\www.justtollywood.com\output-files\images
01/21/2011 02:28 AM <DIR> ..
01/21/2011 02:28 AM <DIR> .
08/18/2010 01:12 PM 1,221 Bakta Pothana.jpg
08/17/2010 03:16 PM 1,221 Alludugaru.jpg
08/16/2010 05:06 PM 1,221 Aadabrathuku.jpg
08/18/2010 01:23 PM 1,221 Bale Ammaiyulu.jpg
08/17/2010 03:45 PM 1,221 Andaru Dongale.jpg
07/04/2009 06:58 PM 3,107 Anasuyamma Gari Alludu.jpg
07/04/2009 07:20 PM 4,162 Akarshana.jpg
07/04/2009 06:56 PM 4,206 Aalu Magalu.jpeg
C:\>
Creates a consolidated .csv file with the extracted details of all the movies.
C:\>dir C:\www.justtollywood.com\output-files\*combine*
Volume in drive C has no label.
Volume Serial Number is 3A1F-BF27
Directory of C:\www.justtollywood.com\output-files
01/21/2011 02:28 AM 372,348 Combined-Output-File.csv
1 File(s) 372,348 bytes
0 Dir(s) 45,088,911,360 bytes free
C:\>head -15 "C:\www.justtollywood.com\output-files\Combined-Output-File.csv"
TITLE, CAPTION, BANNER, CENSOR, CAST, CAST-OTHERS, DIALOGUES, MUSIC-DIRECTOR, EDITING, CINEMATOGRAPHY, SCREENPLAY, DIRECTION, PRODUCER, RELE
ASE-DATE, GENRE, LANGUAGE, IMAGE
Chinthamani, , Madhan Theathers, -, Ramathilakam; Pulipati Venkateswarlu, -, -, -, -, -, -, Kalakuri Sadasiva Rao, Madhan Theathers, -, -, T
elugu, \images\Chinthamani.gif
Avvarini Nammmali, , Natarajan Pictures, -, Harinath; Rajasri, -, -, -, -, -, -, -, -, -, -, Telugu, \images\Avvarini Nammmali.gif
Panchakshari, , Sai Ratna Creations, -, Anushka; Samrat Reddy, Nasser; Chandra Mohan; Brahmanandam ; Pradeep Rawat; Ali; Raghu Babu; M S Nar
ayana; Uttej; Sakunthala; Sana; Jayavani; Jeeva; Benarjee; Ravi Prakash; Telangana Sakuntala; Keerthi , Thota Prasad, Chinna , Marthand K Ve
nkatesh , Vasu , -, V Samudra, Bommadevara Ramachandra Rao, 6/11/2010, -, Telugu, –
Laila Majnu, , Bharani Pictures, -, Bhanumathi Ramakrishna; Akkineni Nageswara Rao, Seeta Rama Anjaneyulu Chilakalpudi; Sri Ranjani; Mukkamu
la; Lalitha Dubey; Kasturi Siva Rao; Padmini; Arrani Satyanarayana; Hemalatha, Samudrala Raghavacharya , C R Subbaraman, -, B S Ranga, Samud
rala Raghavacharya , P S Ramakrishna Rao, Bhanumathi Ramakrishna; P S Ramakrishna Rao, 10/1/1949, -, Telugu, –
Drohi, , Swatahantra, -, K S Prakash Rao; G Varalakshmi; L V Prasad, -, -, -, -, -, -, L V Prasad, K S Prakash Rao, -, -, Telugu, \images\Dr
ohi.jpg
Atha Mechina Aluudu, , Sri Lalitha Kalanjali, -, Krishna ; JayaPradha, -, -, K V Mahadevan, -, -, -, Kodi Ramakrishna, Vakada Appa Rao, 1/19
/1989, -, Telugu, \images\Atha Mechina Aluudu.gif
Daggubati Rana & Suresh Babu New Movie, , Suresh Productions Pvt. Ltd. , -, Rana Daggubati , -, -, -, -, -, -, Anand Ranga, Daggubati Suresh
Babu , -, -, Telugu, \images\Daggubati Rana & Suresh Babu New Movie.gif
Anveshana, , RamKumar Productions, -, Bhanu Priya; Karthik, Kaikala Satyanarayana; Rallapalli; Sharath Babu; Subhalekha Sudhakar ; M Mallika
rjuna Rao; Y Vijaya; K Viswanatham; Balaji, -, Ilayaraja, G R Anil Malnad, M V Raghu, -, Vamsi, Kamineni Prasad, -, -, Telugu, –
Chasastapu Mogudu, , Rajyalakshmi Cine Arts, -, Suman; Bhanu Priya, -, -, -, -, -, -, Sarath, Midha Rama Rao, -, -, Telugu, \images\Chasasta
pu Mogudu.gif
Dabbu Bale Jabbu, , Allu Arts, -, Raj Kumar; SumaLatha, Brahmanandam ; Hema, -, -, -, -, -, K S Rajendra, Allu Ramalingaiah , -, -, Telugu,
\images\Dabbu Bale Jabbu.jpg
Bharateeyudu, , Sri Surya Movies, -, Kamal Hassan; Manisha Koirala; Urmila, Kasthuri ; Sukanya; Goudarmani; Manorama; Nedumudi Venu, Sujatha
, A R Rahman, V T Vijayan, Jeeva Shankar, Shankar , Shankar , A M Rathnam, 8/23/1996, -, Telugu, \images\Bharateeyudu.jpg
Lakshmi Kataksham, , P S R, -, N T RamaRao; K R Vijaya; Rajasri, Kaikala Satyanarayana; Prabhakar Reddy; Balayya; Mikkilineni; Hemalatha, Ch
illara Bhavan Narayana Rao, S P Kodandapani, K Govinda Swamy, H S Venu, B Vitala charya, B Vitala charya, Pinjala Subba Rao, 3/12/1970, -, T
elugu, –
Bhakta Kabir, , Chamrima Takies, -, Seeta Rama Anjaneyulu Chilakalpudi, -, -, -, -, -, -, Rai Mothilal Chamriya; Chitrapu Narayana Rao, Rai
Mothilal Chamriya, -, -, Telugu, \images\Bhakta Kabir.gif
Attinti Sapam, , Modern Theaters, -, Saroja, -, -, -, -, -, -, Acharaya M Mastan, T R Sundaram, -, -, Telugu, \images\Attinti Sapam.gif
C:\>

[/code]

Leave a Reply

Your email address will not be published. Required fields are marked *