ScriptingWeb

Perl script: HtmlAsText.pl to convert HTML content into Text File format

This program illustrates how tools like HtmlAsText typically work.

#! perl
#===============================================================================
# Objective:
# ----------
#
# Script to convert HTML File content into Text File
#
#
# $Header: $
#===============================================================================
# Include Modules
#===============================================================================
use strict;
use warnings;
use Pod::Usage;
use LWP::Simple;
use File::Basename;
use HTML::Table;
use Alvis::HTML;
use HTML::TableExtract qw(tree);
use Getopt::Long qw(:config no_ignore_case bundling);
#===============================================================================
# Global Variables Declaration
#===============================================================================
use vars qw($DEBUG $SRC_FLDR $DEST_FLDR $HTML_TBL_OBJ $ALVIS_HTML_OBJ);
#===============================================================================
# Prototypes Section
#===============================================================================
sub DoAction;
sub My_Readdir;
sub InitGlobals;
sub ProcessArgs;

# Logging helpers. Each takes one message string and prints it with a fixed
# prefix; Info, MyWarn and Debug write to STDOUT, MyErr and MyDie to STDERR.

# Print an informational message on STDOUT.
sub Info {
    my $mesg = shift;
    print STDOUT "INFO:  $mesg\n";
}

# Print an error message on STDERR and carry on.
sub MyErr {
    my $mesg = shift;
    print STDERR "ERROR, $mesg\n";
}

# Print a warning message on STDOUT.
sub MyWarn {
    my $mesg = shift;
    print STDOUT "WARNING, $mesg\n";
}

# Print an error message on STDERR, then terminate with exit status 1.
sub MyDie {
    my $mesg = shift;
    print STDERR "ERROR, $mesg\n";
    exit(1);
}

# Print a debug message on STDOUT, but only when $DEBUG is set.
sub Debug {
    my $mesg = shift;
    print STDOUT "DEBUG, $mesg\n" if $DEBUG;
}
#===============================================================================
# main()
#===============================================================================
{
    # Script entry point: run the three phases in order -- create the shared
    # objects, read the command line, then process the source folder.
    for my $phase (\&InitGlobals, \&ProcessArgs, \&DoAction) {
        $phase->();
    }
    exit 0;
}
#===============================================================================
# sub InitGlobals
#===============================================================================
sub InitGlobals {
    # Creates the two shared objects used by the rest of the script and
    # stores them in the globals $HTML_TBL_OBJ and $ALVIS_HTML_OBJ.

    # Constructor arguments for Alvis::HTML, held in an ordered list so they
    # reach new() in exactly this order.
    my @alvis_options = (
        alvisKeep               => 1,
        alvisRemove             => 1,
        obsolete                => 1,
        proprietary             => 1,
        xhtml                   => 1,
        wml                     => 1,
        keepAll                 => 0,
        assertHTML              => 1,
        convertCharEnts         => 1,
        convertNumEnts          => 1,
        sourceEncoding          => undef,
        cleanWhitespace         => 1,
        assertSourceAssumptions => 1,
    );

    $HTML_TBL_OBJ   = HTML::TableExtract->new();
    $ALVIS_HTML_OBJ = Alvis::HTML->new(@alvis_options);
}
#===============================================================================
# sub ProcessArgs
#===============================================================================
sub ProcessArgs {
    # Parses the command line into the globals $DEBUG, $SRC_FLDR and
    # $DEST_FLDR.
    #
    #   -D         enable debug output
    #   -s <dir>   source folder (required)
    #   -d <dir>   destination folder (defaults to the source folder)
    #   -h, -?     print the full POD help via pod2usage
    #
    # Unknown options or leftover positional arguments produce the usage
    # message via pod2usage(2); a missing source folder ends the script
    # through MyDie.
    Getopt::Long::Configure("bundling", "no_ignore_case");

    my $parsed_ok = GetOptions(
        'D'   => \$DEBUG,
        's=s' => \$SRC_FLDR,
        'd=s' => \$DEST_FLDR,
        'h|?' => sub { pod2usage(-verbose => 2) },
    );
    pod2usage(2) if !$parsed_ok || @ARGV;

    # Test for defined-and-non-empty rather than truthiness so that a folder
    # literally named "0" is accepted.
    MyDie("Specify the folder path of html files to be read")
        unless defined $SRC_FLDR && length $SRC_FLDR;

    # The path is kept exactly as given. Backslashes only need doubling inside
    # Perl string literals, not in runtime values; doubling them here would
    # corrupt UNC prefixes such as \\server\share.
    $DEST_FLDR = $SRC_FLDR
        unless defined $DEST_FLDR && length $DEST_FLDR;

    return;
}
#===============================================================================
# sub DoAction
#===============================================================================
sub DoAction {
    # Collects the .html file names found in $SRC_FLDR (via My_Readdir) and
    # passes each one, in sorted order, to ConvertHtml2Json.
    #
    # This sub always terminates the script with exit status 0, either after
    # reporting that nothing was found or after the last file is processed.
    require File::Spec;    # core module; loaded here so the sub is self-contained

    my %html_files = ();
    My_Readdir($SRC_FLDR, \%html_files);

    if (! %html_files) {
        Info("There are no files in the specified folder path: [$SRC_FLDR]");
        exit 0;
    }

    foreach my $html_file (sort keys %html_files) {
        # Build the path with the platform's own separator; a hard-coded
        # backslash makes the existence test fail everywhere except Windows.
        next if (! -e File::Spec->catfile($SRC_FLDR, $html_file));

        # Debug() already checks $DEBUG, so no extra guard is needed here.
        Debug("Parsing File: [$html_file]");
        ConvertHtml2Json($html_file);
    }

    exit 0;
}
#===============================================================================
# sub ConvertHtml2Json
#
# 
#===============================================================================
sub ConvertHtml2Json {
    # Reads one HTML file from $SRC_FLDR, runs it through
    # $ALVIS_HTML_OBJ->clean and prints the returned header fields and plain
    # text on STDOUT.
    #
    # Despite the name, no JSON is produced and nothing is written to
    # $DEST_FLDR; the output goes to STDOUT only.
    #
    # Argument: the file name, relative to $SRC_FLDR.
    # Returns:  0 if the file cannot be opened, 1 after printing the text.
    #           Exits the script with status 1 if clean() yields no text.
    my ($file_name) = @_;

    require File::Spec;    # core module; loaded here so the sub is self-contained
    my $input_file = File::Spec->catfile($SRC_FLDR, $file_name);

    # Three-argument open with a lexical handle: the file name can never be
    # misread as an open mode, and the handle cannot leak into other code.
    my $html_fh;
    if (! open($html_fh, '<', $input_file)) {
        MyErr("Failed to open $input_file file for reading: $!");
        return 0;
    }

    # Slurp the whole file; fall back to '' so clean() never receives undef.
    my $html_contents = do { local $/; <$html_fh> };
    $html_contents = '' if ! defined $html_contents;
    close $html_fh;

    my ($plain_txt, $header) = $ALVIS_HTML_OBJ->clean($html_contents,
                                                    {title=>1, baseURL=>1});

    print "Header:\n\n";
    # Guard against a missing header so the loop cannot die on a bad deref.
    foreach my $field (keys %{ $header || {} }) {
        print "$field => \t $header->{$field}\n";
    }

    if (! $plain_txt) {
        MyWarn("Outputting the Alvis records failed, Exiting. " .
                $ALVIS_HTML_OBJ->errmsg() );
        $ALVIS_HTML_OBJ->clearerr();
        exit 1;
    }
    print "HTML as Plain text:\n\n$plain_txt\n\n";

    return 1;
}
#===============================================================================
# sub My_Readdir
#
# Read the contents of a directory and populate in a hash
#===============================================================================
sub My_Readdir {
    # Records every entry of $srcdir whose name ends in ".html" as a key of
    # the hash referenced by $hashref (value 0).
    #
    # Arguments: $srcdir  - directory to scan
    #            $hashref - reference to the hash that receives the names
    # If the directory cannot be opened, an error is reported through MyErr
    # and the hash is left untouched.
    my ($srcdir, $hashref) = @_;

    my $dir_handle;
    if (! opendir($dir_handle, $srcdir)) {
        # Report through MyErr, which this file defines.
        MyErr("opendir() failed for $srcdir: $!");
        return;
    }

    Debug("Reading list of files from folder: $srcdir");
    while (defined (my $file = readdir $dir_handle)) {
        # Keep only names with a literal ".html" suffix. The dot is escaped so
        # names such as "cxhtml" are not picked up; "." and ".." never match.
        next if $file !~ /\.html\z/;
        $$hashref{$file} = 0;
    }
    closedir($dir_handle);

    return;
}
#===============================================================================
# documentation
#===============================================================================
__END__
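
=head1 NAME

B<HtmlAsText.pl> - convert the HTML files in a folder into plain text

=head1 SYNOPSIS

HtmlAsText.pl -s <source folder> [-d <destination folder>] [-D] [-h]

=head1 OPTIONS

=over 4

=item B<-s> <source folder>

Folder containing the .html files to read. Required.

=item B<-d> <destination folder>

Destination folder. Defaults to the source folder when omitted.

=item B<-D>

Run in debug mode.

=item B<-h>, B<-?>

Print this help message.

=back

=cut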
Advertisements

Leave a Reply

This site uses Akismet to reduce spam. Learn how your comment data is processed.