#!/usr/bin/perl -w use strict; use Getopt::Long; use Text::Wrap; my( @lines, $text, $width, $whole, $i, $character, $year, $isodate, $translationfile, $infile, $outfile, $pagetitle, $char, $blankline, $option, %opts ); #################### initialize some values ########################################### $whole = ""; $i = 0; $translationfile = "/home/username/lib/char/transtable.text"; ### Get command line options and print usage if necessary ### most of these options don't have a function yet Getopt::Long::Configure('bundling'); if( !(GetOptions ( \%opts, "f|file", "v|verbose", "l|long", "h|help",)) || defined($opts{h})) { print STDERR "Usage: xhtmldoc [options] [file]\n"; print STDERR "Convert MS Word doc to valid XHTML.\n"; print STDERR "\nOptions:\n"; print STDERR "\t-f, --file \twrite output to file instead of STDOUT\n"; print STDERR "\t-v, --verbose \tdebuggin info!\n"; print STDERR "\t-l, --long \tdon't delete junk characters\n"; print STDERR "\t-h, --help \tprint this usage info\n"; print STDERR "\n"; exit; } ################################################################################ ################### START Main code Block ###################################### ################################################################################ if (not (defined($ARGV[0]))) { ## needs incoming file print "Syntax error: No doc file defined\n"; exit; } else { ## has file, do conversions $infile = $ARGV[0]; $text = &ReadFile( $infile ); $text = &Clean( $text ); $text = &Trans( $text, $translationfile ); $text = &Style( $text ); $text = WrapFormat ($text); ################################### output the XHTML #################################### if( defined( $opts{"f"})) { ## output goes to a file if ( $ARGV[0] =~ m/^(.*)\.doc/ ) { ## make the filename to write to $outfile = "$1".".html"; } else { $outfile = "newfile.html"; } print "$outfile\n"; ## filename to STDOUT open(OUTFILE, "> $outfile") || die "can't opendir $outfile: $!"; select OUTFILE; &Output($pagetitle, $text); ## actually prints the stuff close OUTFILE; } else { ## output goes to STDOUT &Output($pagetitle, $text); ## actually prints the stuff exit; } } ################################################################################ ################### END Main code Block ####################################### ################################################################################ ####################################################################################### ### Read the input from a file ### $wholefile = &ReadFile( $filename ); ###################################################################################### sub ReadFile { my( $filename, $opt) = @_; my( $line, $wholefile ); $wholefile = ""; if (defined $filename) { open(THATFILE, $filename) || die "can't open $filename: $!";; while( defined( $line = )) { $wholefile .= $line; # add the line to the big text string } close(THATFILE); } else { while( defined( $line = )) { $wholefile .= $line; # add the line to the big text string } } return $wholefile; } ####################################################################################### ### ($text) = &Clean ( $text, $option); gets rid of obvious junk ####################################################################################### sub Clean { my( $line, $opt) = @_; $line =~ s/(\377){8}//g; # delete 4 or more consecutive M-^? $line =~ s/(\c@){2,}//g; # delete 4 or more Control AT's return $line; } ############################################################################################### ### ($text) = &Trans ( $text, $charfile, $option,); ### Translates characters from conversion file. ############################################################################################## sub Trans { my ($text, $charfile) = @_; my (@characters, $charline, $line, $length, $high, $sgml, $single); open(THATFILE, $charfile) || die "can't open $charfile: $!";; ########### gets the translation info from the file ############### while( defined( $line = )) { # print "$line
\n"; next if( $line =~ m/^\s*#/); # ignore commented lines next if( $line =~ m/^$/); # ignore empty lines chomp($line); push( @characters, $line); # fill the array with info } close(THATFILE); ########### use the translation table to modify the text ########## $length = @characters; while (defined ($charline = shift(@characters))) { # print "$charline"; $charline =~ m/^(\d{3})\s+(&#?\w+\;)/; $high = "\\"."$1"; $sgml = $2; # print "high: -$high- sgml: -$sgml- \n"; $text =~ s/$high/$sgml/g; } return ($text); } ####################################################################################### ### ($text) = &Style ( $text, $option); ### format text for your preferences ####################################################################################### sub Style { my( $line, $opt) = @_; my ( $title); ## $line =~ s/^.*?[PF]\cA\c@\cO//s; # delete header ## $line =~ s/[PF]\cA\c@\cO.*?$//s; # delete footer # $line =~ s/\cC \cB/\n/g; # turn soft line breaks into real newlines # $line =~ s/\cC \cC/\n/g; # turn soft line breaks into real newlines ## $line =~ s/\cM{2,}/\n\n<\/p>

\n\n/g; # multiple returns into paragraps $line =~ s/(\c@){2,8}(.{8,10})/\n\n$2/; # tries to put space at the beginning of text $line =~ s/~\n//g; # gets rid of "twidle newline" # $line =~ s/\c@\n//g; # gets rid of "control at newline" ####### title identification stuff ###################### $line =~ s/\W*(\w?([\w ]{7,})\cM)/\n\n$1/; # try to put newlines before the first actual words $line =~ s/(\c@){3}(([\w ]{7,})\cM)/$1\n\n$2/; # possibly finds the title $title = $2; # grabs the found title # $title =~ s/\b(\w)(\w*)/$1.lc($2)/ge; # huh? # $title =~ s/^(.+)Of/$1of/g; # if Of isn't the first word of the title # (\cM)* # tries to put space at the beginning of text ####### paragraph and structure/style formatting ######### $line =~ s/\cM\s*\cM{1,}/\n\n<\/p>

\n\n/g; # multiple returns into paragrap boundaries $line =~ s/\cM/\n/g; # single returns into forced BReaks $line =~ s/\cL/

/sg; # requires human pair pattern matching # $line =~ s/\cL(.*)\cL/
$1<\/lockquote>/sg; ######## Et Cetera ####################################################### $line =~ s/\x97/-/g; ######## Cleanup remaining unused characters ############################### ## these are intentionally commented out so that I can identify any ## new special word characters and find out what to turn them into. ## $line =~ s/[\000-\011]//g; # remove remaining low-ascii characters ## $line =~ s/[\013-\037]//g; # remove remaining low-ascii characters # $line =~ s/\013-\037//g; # remove low control-characters but newlines. # $line =~ tr [\240-\376] # [\040-\176]; # turns "Meta-Alpha" into "Alpha" # $line =~ tr/\240-\376/\040-\176/; # turns "Meta-Alpha" into "Alpha" # $line =~ tr/\200-\377//d; # remove all remaining high ascii characters # return ($line, $title); return ($line); } ####################################################################################### ### Formats the long lines so that they wrap ### $fits = &WrapFormat( $longline, $option); ####################################################################################### sub WrapFormat { my( $line, $opt) = @_; my ( $paragraph, @paragraph, $temp, $newtext); @lines = split /\n/, $line; $newtext = ''; foreach (@lines) { $i++; $paragraph = $_; if( length( $paragraph) > 80) { @paragraph = split /\n/, $paragraph; $Text::Wrap::columns = 72; $temp = wrap('', '', @paragraph); $newtext = $newtext.$temp."\n"; } else { $newtext = $newtext.$paragraph."\n"; } } return ($newtext); } ####################################################################################### ### Prints all the output ####################################################################################### sub Output { my( $title, $text, $opt) = @_; ## print "Content-Type: text/html;charset=UTF-8\n\n"; print <<'END'; END ## print "\n"; print "<\/title>\n"; print "<\/head>\n\n"; print "<body class=\"document\">\n\n<h1><\/h1> \n"; print "<div class=\"text\"> \n\n\n"; print $text; print "\n\n\n"; print <<'END'; </body> </html> END } =head1 Symopsis This program converts Microsoft Word files to xhtml 1.1 2003-03-07 started 2003-03-08 updated 2003-03-09 updated 2003-06-09 bugfix: '[' ']' characters getting deleted by bad /tr/ syntax. 2003-07-09 cleaning up / generalize code for "release" This program is governed by the GPL http://www.gnu.org/licenses/gpl.txt brett hamilton --- http://simple.be/software/xhtmldoc/ =cut