PERL   34

html2latex

Guest on 23rd June 2022 02:59:58 AM

  1. #!/usr/bin/perl -w
  2.  
  3. #   html2latex
  4. #   Copyright (C) Peter Thatcher
  5.  
  6. #   This program is free software; you can redistribute it and/or
  7. #   modify it under the terms of the GNU General Public License
  8. #   as published by the Free Software Foundation; either version 2
  9. #   of the License, or (at your option) any later version.
  10. #   This program is distributed in the hope that it will be useful,
  11. #   but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. #   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13. #   GNU General Public License for more details.
  14.  
  15. #   You should have received a copy of the GNU General Public License
  16. #   along with this program; if not, write to the Free Software
  17. #   Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
  18.  
  19. ###################### BEGIN SETUP #########################
  20.  
  21.  
  22. use strict;
  23. use Getopt::Long;
  24. use File::Basename;
  25.  
  # Help text printed for --help/-h/-? and when no input file is given.
  # The heredoc is single-quoted, so nothing inside it is interpolated.
  # Every option listed here is one that the GetOptions() specification in
  # this script accepts; keep the two in sync when adding options.
  my $usage = <<'STOP';
Usage: html2latex [options] <filename>...

Optional Parameters:
    --help -h -?                        print this help
    --image_scale --image -i <float>    set image scale
    --font_size --font -f <integer>     set font size.  Must be 10-12
    --debug -d                          print debugging info (repeat for more)
    --table_border --table --border     turn table borders on
    --document_class --document
    --class <string>                    set latex document class
    --package <string>                  add a latex package (may be repeated)
    --latex2pdf --pdf -p                create a pdf as well as a latex file
    --paragraph --par -P                set the style of the paragraph to
                                        HTML style
    --mbox -m                           put a tex \mbox around every table
    --ban -b <string>                   ban (ignore) tag (may be repeated)
    --head <string>                     add option to documentclass definition
    --conf -C <string>                  use alternate configuration file
    --log -l <string>                   use alternate log file
STOP
  46.  
  47. ################## BEGIN OPTION PARSING ####################
  48.  
  use Cwd qw(getcwd);

  # Destinations for the parsed command line.
  #   %options  - scalar settings handed to the converter in one hash
  #   @packages - values of every --package given
  #   @heads    - values of every --head given
  #   @banned   - values of every --ban given
  my %options  = ();
  my @packages = ();
  my @heads    = ();
  my @banned   = ();
  my $latex2pdf;
  my $conffile;
  my $logfile;

  # Getopt::Long ignores case by default, which makes the documented short
  # options -p (--latex2pdf) and -P (--paragraph) the same name: the later
  # specification silently replaces the earlier one.  Match case exactly so
  # both short options reach their own variable.
  Getopt::Long::Configure('no_ignore_case');

  # GetOptions() returns false when it met an unknown or malformed option
  # (it has already warned about it); show the usage and stop instead of
  # converting with a half-understood command line.
  GetOptions(
      'image_scale|image|i=f'           => \$options{image},
      'font_size|font|f=i'              => \$options{font_size},
      'debug|d+'                        => \$options{debug},
      'table_border|table|border!'      => \$options{border},
      'document_class|document|class=s' => \$options{document_class},
      'help|?|h'                        => sub { print $usage; exit 0 },
      'package=s'                       => \@packages,
      'head=s'                          => \@heads,
      'ban|b=s'                         => \@banned,
      'latex2pdf|pdf|p!'                => \$latex2pdf,
      'mbox|m!'                         => \$options{mbox},
      'cache|local'                     => \$options{cache},
      'paragraph|par|P!'                => \$options{paragraph},
      'conf|C=s'                        => \$conffile,
      'log|l=s'                         => \$logfile,
  ) or do {
      print STDERR $usage;
      exit 1;
  };

  # Nothing left on the command line means there is nothing to convert.
  unless (@ARGV) {
      print $usage;
      exit;
  }

  # Directory we start out from; latex2pdf() changes directory to run
  # pdflatex and comes back here afterwards.  getcwd() avoids spawning a
  # shell just to run `pwd`.
  my $START_DIR = getcwd();
  defined $START_DIR
      or die "html2latex: cannot determine the current directory: $!\n";
  76.  
  77. ################### END OPTION PARSING ####################
  78.  
  79. ###################### BEGIN MAIN #########################
  80.  
  use HTML::Latex;

  # Build one converter and configure it from the command line.
  # --conf used to be parsed and then ignored; the file name is now handed to
  # the constructor.  NOTE(review): this relies on HTML::Latex->new accepting
  # the configuration file path as its argument -- confirm against the
  # installed HTML::Latex.  Without --conf the call is the same as before.
  my $parser = defined $conffile
      ? HTML::Latex->new($conffile)
      : HTML::Latex->new();
  $parser->set_option(\%options);
  $parser->add_package(@packages);
  $parser->add_head(@heads);
  $parser->ban_tag(@banned);
  $parser->set_log($logfile) if defined($logfile);

  # Convert every file/URL named on the command line, in order.
  foreach my $uri (@ARGV) {
      #Option 1:
      #my $in = IO::File->new("< $uri");
      #my $out = IO::File->new("> $uri.tex");
      #$parser->html2latex($in,$out);

      #Option 2:
      print "Processing $uri\n";
      my ($htmlfile, $latexfile) = $parser->html2latex($uri);

      next unless $latex2pdf;

      # Guard in case html2latex() handed back no latex file name: there is
      # nothing for pdflatex to work on then.
      unless (defined $latexfile) {
          warn "PDF: no latex file was produced for $uri, skipping pdflatex\n";
          next;
      }
      latex2pdf($htmlfile, $latexfile);
  }
  100.  
  101. #Option 3:
  102. #my $html_string = join("\n",<>);
  103. #my $html_string = join("",@ARGV);
  104. #my $tex_string = $parser->parse_string($html_string,1);
  105. #print $tex_string;
  106.  
  107. ####################### END MAIN ##########################
  108.  
  109. ####################### BEGIN SUBS ########################
  110.  
  111. #Routine to print help
  112.  
  # latex2pdf($htmlfile, $latexfile)
  #
  # Runs the system command "pdflatex" on $latexfile and reports how many
  # error lines (lines starting with "!") pdflatex printed.
  #   $htmlfile  - name of the HTML source; only used in the progress message
  #   $latexfile - path of the .tex file to compile
  # pdflatex is run from the directory that holds $latexfile, so its output
  # files are created next to the .tex file; afterwards the process returns
  # to $START_DIR.  Returns nothing.
  sub latex2pdf {
      my ($htmlfile, $latexfile) = @_;
      my ($base, $path, $extension) = fileparse($latexfile, '\.tex');
      my $texrel  = "$base$extension";   #relative name of the tex file
      my $logfile = "$path$base.log";

      print "PDF: Processing $latexfile from $htmlfile and $texrel\n";

      #pdflatex, no matter what I do always insists on making the output
      #files in the current directory.  So here is my workaround.
      unless (chdir $path) {
          # Running pdflatex from the wrong directory would not find $texrel.
          warn "PDF: cannot change directory to $path: $!\n";
          return;
      }

      # List-form pipe open: the file name is passed to pdflatex as a single
      # argument and never goes through a shell, so spaces or shell
      # metacharacters in it are harmless.
      my @errors;
      if (open(my $pdflatex, '-|', 'pdflatex', '-interaction=nonstopmode', $texrel)) {
          @errors = grep { /^!/ } <$pdflatex>;
          close $pdflatex;   # exit status is not checked; "!" lines are what we report
      }
      else {
          warn "PDF: cannot run pdflatex: $!\n";
      }

      # Later relative file names are resolved from the start directory, so
      # failing to get back there cannot be ignored.
      chdir $START_DIR
          or die "PDF: cannot change directory back to $START_DIR: $!\n";

      if (@errors) {
          print "PDF: pdflatex reported " . scalar(@errors) . " errors while creating PDF.\n\tCheck $logfile for more information.\n";
      }
      return;
  }
  133.  
  134. =head1 NAME
  135.  
  136. html2latex - HTML to latex converter.
  137.  
  138. =head1 SYNOPSIS
  139.  
  140. html2latex [OPTION]... URLS...
  141.  
  142. =head1 DESCRIPTION
  143.  
  144. html2latex uses HTML::TreeBuilder to parse an HTML file and then it
  145. converts the HTML::Element tree into a Latex file.  Each URL will have a
  146. .*html extension stripped.  If you use a URL, then the files taken
  147. from the Internet will be stored in your ~/.html2latex directory.  If
  148. pictures are included, they are converted to .PNG, which can only be
  149. used with pdflatex. As an added bonus, there is an option to
  150. automatically create a PDF from the Latex file (using pdflatex).
  151.  
  152. =head1 REQUIRES
  153.  
  154. If your html2latex is not working correctly, this may be because you
  155. do not have all of the needed packages.  html2latex requires
  156. HTML::TreeBuilder, and perhaps LWP::Simple and URI.  If you do not have
  157. any of these, try typing B<perl -MCPAN -e shell> at the command
  158. line.  This will bring up a shell for CPAN (The Comprehensive Perl
  159. Archive Network). Then, as root try typing B<install
  160. HTML::TreeBuilder>. Should work like magic.
  161.  
  162. =head1 URLS
  163.  
  164. In your list of URLs any filename given after a URL will continue to
  165. use the latest HOST given.  Also, files default to index.html,
  166. regardless of what the server thinks.  So, if you type:
  167.  
  168. C<html2latex http://slashdot.org foo.html http://linuxtoday.net bar.html>
  169.  
  170. html2latex will try to grab http://slashdot.org/index.html,
  171. http://slashdot.org/foo.html, http://linuxtoday.net/index.html, and
  172. http://linuxtoday.net/bar.html
  173.  
  174. =head1 OPTIONS
  175.  
  176. Options are secondary to document-specified options.  So, if your HTML
  177. file has border=1, a border will be printed regardless of the
  178. B<--border> option.  They do override, however, options given in the
  179. configuration file.  If you want to change things more permanently,
  180. try changing the config file, html2latex.xml.  For information on it,
  181. try the HTML::Latex under section CONFIGURATION FILE.
  182.  
  183. =over 4
  184.  
  185. =item B<-h -? --help>
  186.  
  187. Print the brief help and usage.
  188.  
  189. =item B<--latex2pdf --pdf -p>
  190.  
  191. Automatically create a PDF along with a latex file named FILE.pdf.
  192. This may fail and print a number of cryptic errors.
  193.  
  194. =item B<-i --image --image_scale=SCALE>
  195.  
  196. Set the scale for images in the latex file.  This is useful because
  197. some images in HTML are much too big to fit on a page.  Default is 1.0.
  198. SCALE can be any non-zero positive floating point number; large
  199. numbers are not recommended.
  200.  
  201. =item B<-f --font --font_size=SIZE>
  202.  
  203. Set the default font size.  Can be 10-12.  Do not try anything else.
  204. html2latex will not check it, but the latex file will not compile (at
  205. least I think not).  Default is 12.
  206.  
  207. =item B<-d --debug>
  208.  
  209. Level of debugging info to print.  The more times this option is used,
  210. the higher the level.  Default is 0, and you cannot lower that.  Right
  211. now, 0 prints nothing.  1 prints fun code-tracking info.  2 prints
  212. lots of data-structure information, so don't do it unless you're
  213. serious.
  214.  
  215. =item B<--border --table --table_border>
  216.  
  217. Turns table borders on.  Default is off.  Also, B<--noborder> or
  218. B<--notable> will explicitly turn table borders off.
  219.  
  220. =item B<--class --document --document_class=CLASS>
  221.  
  222. Set the documentclass to use.  Any valid latex document class is
  223. valid.  Examples are B<report>, B<book>, and B<article>.  B<article>
  224. is the default.  If an invalid document class is used, the output
  225. latex file will not compile.
  226.  
  227. =item B<--package=PACKAGE>
  228.  
  229. html2latex will create a latex file using any packages that you
  230. specify.  PACKAGE will be added to the list of packages to put in the
  231. file.  html2latex will not make sure the packages are valid, but if
  232. they aren't the latex file won't compile.
  233.  
  234. =item B<--head=HEAD>
  235.  
  236. Latex allows you to add options in the preamble of the form
  237. \documentclass[OPTIONS]{article}.  Each HEAD you add gets added to the
  238. list included.  For instance, you could use C<--head=twocolumn> to add
  239. the 'twocolumn' feature of Latex.  Since font sizes are already added,
  240. don't add them yourself.  See C<--font>
  241.  
  242. =item B<--mbox -m>
  243.  
  244. With any of these, html2latex will put a tex \mbox around all of the
  245. tables it creates.  I do not know why, but with a lot of tables
  246. (especially nested ones), the tex and pdf output will work better.
  247. So, if you do not like your output with tables, try this.
  248.  
  249. =item B<--paragraph --par -P>
  250.  
  251. Uses HTML-style paragraphs.  This is the default, so try --noparagraph,
  252. --nopar or --noP to turn it back to Latex-style paragraphs.
  253.  
  254. =item B<--cache --local>
  255.  
  256.  
  257. =item B<--log -l LOGFILE>
  258.  
  259. Print all messages to LOGFILE instead of STDERR.
  260.  
  261. =item B<--conf -C CONFFILE>
  262.  
  263. Change the configuration file to CONFFILE.  For more information on
  264. this file, see the HTML::Latex manpage.
  265.  
  266. =back
  267.  
  268. =head1 Development
  269.  
  270. Development is being carried out by Peter Thatcher
  271. (peterthatcher@asu.edu) and Stan Seibert (volsung@asu.edu).  Homepage
  272. is http://html2latex.sourceforge.net.
  273.  
  274. =cut

Raw Paste


Login or Register to edit or fork this paste. It's free.