Perl script: HtmlAsText.pl to convert HTML content into Text File format

This program illustrates how tools like HtmlAsText typically work.

[code language="perl"]
#! perl
#===============================================================================
# Objective:
# ----------
#
# Script to convert HTML File content into Text File
#
#
# $Header: $
#===============================================================================
# Include Modules
#===============================================================================
use strict;
use warnings;
use Pod::Usage;
use LWP::Simple;
use File::Basename;
use HTML::Table;
use Alvis::HTML;
use HTML::TableExtract qw(tree);
use Getopt::Long qw(:config no_ignore_case bundling);
#===============================================================================
# Global Variables Declaration
#===============================================================================
use vars qw($DEBUG $SRC_FLDR $DEST_FLDR $HTML_TBL_OBJ $ALVIS_HTML_OBJ);
#===============================================================================
# Prototypes Section
#===============================================================================
sub DoAction;
sub My_Readdir;
sub InitGlobals;
sub ProcessArgs;
# Print an informational message on STDOUT, prefixed with "INFO: ".
sub Info {
    my $mesg = shift;
    print STDOUT "INFO: $mesg\n";
}
# Report an error on STDERR, prefixed with "ERROR, "; does not terminate.
sub MyErr {
    my $mesg = shift;
    print STDERR "ERROR, $mesg\n";
}
# Print a warning, prefixed with "WARNING, ".  Note that it goes to STDOUT,
# not STDERR.
sub MyWarn {
    my $mesg = shift;
    print STDOUT "WARNING, $mesg\n";
}
# Report a fatal error on STDERR, prefixed with "ERROR, ", and terminate the
# script with exit status 1.
sub MyDie {
    my $mesg = shift;
    print STDERR "ERROR, $mesg\n";
    exit(1);
}
# Print a message on STDOUT, prefixed with "DEBUG, ", but only when the
# file-level $DEBUG flag is true; otherwise do nothing.
sub Debug {
    my $mesg = shift;
    if ($DEBUG) {
        print STDOUT "DEBUG, $mesg\n";
    }
}
#===============================================================================
# main()
#===============================================================================
# Script entry point: create the shared objects, parse the command line and
# then process the files.  DoAction() itself finishes with "exit 0", so the
# trailing exit only matters if that ever changes.
MAIN: {
    InitGlobals();
    ProcessArgs();
    DoAction();
    exit(0);
}
#===============================================================================
# sub InitGlobals
#===============================================================================
# Create the two shared objects the rest of the script reads from the
# file-level variables $HTML_TBL_OBJ and $ALVIS_HTML_OBJ.  Takes no arguments.
sub InitGlobals {
    # Table extractor built with its default settings.  Nothing else in the
    # visible script reads this object.
    $HTML_TBL_OBJ = HTML::TableExtract->new();

    # Constructor arguments for the Alvis::HTML cleaner, kept as an ordered
    # list so they are passed in exactly the order written here.
    my @cleaner_options = (
        alvisKeep               => 1,
        alvisRemove             => 1,
        obsolete                => 1,
        proprietary             => 1,
        xhtml                   => 1,
        wml                     => 1,
        keepAll                 => 0,
        assertHTML              => 1,
        convertCharEnts         => 1,
        convertNumEnts          => 1,
        sourceEncoding          => undef,
        cleanWhitespace         => 1,
        assertSourceAssumptions => 1,
    );

    $ALVIS_HTML_OBJ = Alvis::HTML->new(@cleaner_options);
}
#===============================================================================
# sub ProcessArgs
#===============================================================================
# Parse the command line into the file-level option variables.
#
# Options:
#   -D          set $DEBUG
#   -s <path>   set $SRC_FLDR (required)
#   -d <path>   set $DEST_FLDR (defaults to the value of $SRC_FLDR)
#   -h / -?     show the full POD and exit
#
# Calls pod2usage(2) on an unknown option or on leftover arguments, and
# MyDie() when no source folder was given.
sub ProcessArgs {
    Getopt::Long::Configure("bundling", "no_ignore_case");

    # The option specs must be plain ASCII-quoted strings; typographic
    # quotes are not string delimiters in Perl.
    my $parsed_ok = GetOptions(
        'D'   => \$DEBUG,
        's=s' => \$SRC_FLDR,
        'd=s' => \$DEST_FLDR,
        'h|?' => sub { pod2usage(-verbose => 2) },
    );
    pod2usage(2) if !$parsed_ok || @ARGV;

    # Test for "defined and non-empty" rather than plain truth so that a
    # folder literally named "0" is still accepted.
    MyDie("Specify the folder path of html files to be read")
        unless defined $SRC_FLDR && length $SRC_FLDR;

    # NOTE(review): this doubles every backslash in the source path.  It is
    # kept as-is because the rest of the script builds paths from the doubled
    # form; confirm whether it is still wanted (a UNC path would be altered).
    $SRC_FLDR =~ s/\\/\\\\/g;

    $DEST_FLDR = $SRC_FLDR
        unless defined $DEST_FLDR && length $DEST_FLDR;
}
#===============================================================================
# sub DoAction
#===============================================================================
# Collect the .html files found in $SRC_FLDR and convert each one in sorted
# name order.  Always ends the script with exit status 0, including when no
# files were found.
sub DoAction {
    my %found = ();
    My_Readdir($SRC_FLDR, \%found);

    my @file_names = sort keys %found;
    unless (@file_names) {
        Info("There are no files in the specified folder path: [$SRC_FLDR]");
        exit 0;
    }

    for my $html_file (@file_names) {
        # Skip entries that disappeared between the directory scan and now.
        if (-e "$SRC_FLDR\\$html_file") {
            # Debug() checks $DEBUG itself, so no extra guard is needed here.
            Debug("Parsing File: [$html_file]");
            ConvertHtml2Json($html_file);
        }
    }

    exit 0;
}
#===============================================================================
# sub ConvertHtml2Json
#
#
#===============================================================================
# Read one HTML file from $SRC_FLDR, pass its contents through the shared
# Alvis::HTML object and print the returned header fields and plain text on
# STDOUT.  Despite the name, no JSON is produced and no output file is
# written; the name is kept so existing callers continue to work.
#
# Parameters:
#   file name - name of the HTML file, relative to $SRC_FLDR.
# Returns:
#   1 after the text has been printed, 0 when the file cannot be opened.
# Exits the script with status 1 when the cleaner returns no text.
sub ConvertHtml2Json {
    my ($file_name) = @_;
    my $input_file = "$SRC_FLDR\\$file_name";

    # Three-argument open with a lexical handle: the file name can no longer
    # be misread as an open mode, and the handle is not a package global.
    my $html_fh;
    if (! open($html_fh, '<', $input_file)) {
        MyErr("Failed to open $input_file file for reading: $!");
        return 0;
    }

    # Slurp the whole file at once; $/ is localised so the change of input
    # record separator cannot leak out of this block.
    my $html_contents = do { local $/; <$html_fh> };
    close $html_fh;

    # Guarantee a defined string even if the read yields nothing.
    $html_contents = '' unless defined $html_contents;

    my ($plain_txt, $header) = $ALVIS_HTML_OBJ->clean($html_contents,
        { title => 1, baseURL => 1 });

    print "Header:\n\n";
    # Sorted so the output order is stable from run to run; undefined values
    # are printed as empty strings instead of raising a warning.
    foreach my $field (sort keys %{ $header || {} }) {
        my $value = defined $header->{$field} ? $header->{$field} : '';
        print "$field => \t $value\n";
    }

    if (! $plain_txt) {
        MyWarn("Outputting the Alvis records failed, Exiting. " .
            $ALVIS_HTML_OBJ->errmsg() );
        $ALVIS_HTML_OBJ->clearerr();
        exit 1;
    }

    print "HTML as Plain text:\n\n$plain_txt\n\n";
    return 1;
}
#===============================================================================
# sub My_Readdir
#
# Read the contents of a directory and populate in a hash
#===============================================================================
# Record every file in a directory whose name ends in ".html".
#
# Parameters:
#   $srcdir  - path of the directory to scan.
#   $hashref - hash reference that receives one key per matching file name,
#              each with the value 0.
# Returns nothing useful.  When the directory cannot be opened, the error is
# reported through MyErr() and the hash is left untouched.
sub My_Readdir {
    my ($srcdir, $hashref) = @_;

    # Lexical directory handle instead of the package-global bareword DIR.
    my $dir_handle;
    if (! opendir($dir_handle, $srcdir)) {
        # The original called My_Log(), which is not defined in this script;
        # MyErr() is the script's own error reporter.
        MyErr("opendir() failed for $srcdir: $!");
        return;
    }

    Debug("Reading list of files from folder: $srcdir");
    while (defined(my $file = readdir $dir_handle)) {
        # Keep only names ending in a literal ".html".  The dot is escaped so
        # names such as "foohtml" no longer match; "." and ".." are excluded
        # by the same test.
        next if $file !~ /\.html$/;
        $$hashref{$file} = 0;
    }
    closedir($dir_handle);
}
#===============================================================================
# documentation
#===============================================================================
__END__
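=head1 NAME

B<HtmlAsText.pl> - convert the HTML files in a folder into plain text.

=head1 SYNOPSIS

HtmlAsText.pl -s <source folder> [-d <destination folder>] [-h] [-D]

=head1 OPTIONS

=over 4

=item B<-D>

Run in debug mode.

=item B<-s> <source folder>

Folder containing the .html files to read. Required.

=item B<-d> <destination folder>

Destination folder. Defaults to the source folder. The script currently
prints the converted text to standard output.

=item B<-h>

Print this help message.

=back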
[/code]

Leave a Reply

Your email address will not be published. Required fields are marked *