PERL   86

stripsgml

Guest on 25th August 2022 01:53:44 PM

  1. #! /usr/local/bin/perl
  2. ##---------------------------------------------------------------------------##
  3. ##  File:
  4. ##      stripsgml
  5. ##  Author:
  6. ##      Earl Hood       ehood@convex.com
  7. ##  Description:
  8. ##      Remove SGML markup.  Reads from STDIN, writes to STDOUT.
  9. ##      Character entities are converted to ASCII equivalents.  However,
  10. ##      entities that have no ASCII substitution defined are dropped
  11. ##      from the output.
  12. ##
  13. ##  Usage:
  14. ##      % stripsgml [-html] < file.sgml > file.txt
  15. ##
  16. ##      The -html option causes URLs in anchor elements in an HTML document
  17. ##      to be preserved in the text output.
  18. ##
  19. ##  Changes
  20. ##      0.1.1:  Syntax changes for execution under perl 5.
  21. ##
  22. ##---------------------------------------------------------------------------##
  23. ##  Copyright (C)   Earl Hood, ehood@convex.com
  24. ##
  25. ##  This program is free software; you can redistribute it and/or modify
  26. ##  it under the terms of the GNU General Public License as published by
  27. ##  the Free Software Foundation; either version 2 of the License, or
  28. ##  (at your option) any later version.
  29. ##
  30. ##  This program is distributed in the hope that it will be useful,
  31. ##  but WITHOUT ANY WARRANTY; without even the implied warranty of
  32. ##  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  33. ##  GNU General Public License for more details.
  34. ##
  35. ##  You should have received a copy of the GNU General Public License
  36. ##  along with this program; if not, write to the Free Software
  37. ##  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  38. ##---------------------------------------------------------------------------##
  39.  
  40. package main;
  41.  
  42. ##---------------------------------------------------------------------------##
  43. ## Store name of program ##
  44. ($PROG = $0) =~ s/.*\///;
  45. $VERSION = "0.1.1";
  46.  
  47. ## Require libraries ##
  48. unshift(@INC, '.', '/share/ilash/corpora/SGML/perlSGML/lib');
  49. require "sgml.pl" || die "Unable to require sgml.pl\n";
  50.  
  51. ##---------------------------------------------------------------------------##
  52. ## Associative array for entity substitutions
  53. ##
  54. %Entity = (
  55.  
  56. #-----------------------------
  57. # Numeric and Special Graphic
  58. #-----------------------------
  59. #   Entity      ASCII substitution
  60. #   ----------  ------------------
  61.     "half",     "1/2",  # fraction one-half
  62.     "frac12",   "1/2",
  63.     "#189",     "1/2",
  64.     "frac14",   "1/4",  # fraction one-quarter
  65.     "#188",     "1/4",
  66.     "frac34",   "3/4",  # fraction three-quarters
  67.     "#190",     "3/4",
  68.     "frac18",   "1/8",  # fraction one-eighth
  69.     "frac38",   "3/8",  # fraction three-eighths
  70.     "frac58",   "5/8",  # fraction five-eighths
  71.     "frac78",   "7/8",  # fraction seven-eighths
  72.     "sup1",     "[1]",  # superscript one
  73.     "#185",     "[1]",
  74.     "sup2",     "[2]",  # superscript two
  75.     "#178",     "[2]",
  76.     "sup3",     "[3]",  # superscript three
  77.     "#179",     "[3]",
  78.     "plus",     "+",    # plus sign
  79.     "plusmn",   "+/-",  # plus-or-minus sign
  80.     "lt",       "<",    # less-than sign
  81.     "equals",   "=",    # equals sign
  82.     "gt",       ">",    # greater-than sign
  83.     "divide",   "/",    # divide sign
  84.     "times",    "*",    # times sign
  85.     "curren",   "\$",   # general currency sign
  86.     "#164",     "\$",
  87.     "pound",    "L",    # pound sign
  88.     "#163",     "L",
  89.     "dollar",   "\$",   # dollar sign
  90.     "cent",     "c",    # cent sign
  91.     "#162",     "c",
  92.     "yen",      "Y",    # yen sign
  93.     "#165",     "Y",
  94.     "num",      "#",    # number sign
  95.     "percent",  "%",    # percent sign
  96.     "amp",      "&",    # ampersand
  97.     "ast",      "*",    # asterisk
  98.     "commat",   "@",    # commercial at
  99.     "lsqb",     "[",    # left square bracket
  100.     "bsol",     "\\",   # reverse solidus
  101.     "rsqb",     "]",    # right square bracket
  102.     "lcub",     "{",    # left curly bracket
  103.     "verbar",   "|",    # vertical bar
  104.     "rcub",     "}",    # right curly bracket
  105.     "copy",     "(C)",  # copyright sign
  106.     "#169",     "(C)",
  107.     "reg",      "(R)",  # registered sign
  108.     "#174",     "(R)",
  109.     "trade",    "(TM)", # tradmark
  110.     "brvbar",   "|",    # broken vertical bar
  111.     "#166",     "|",
  112.     "excl",     "!",    # exclamation mark
  113.     "quot",     '"',    # quotation mark
  114.     "apos",     "'",    # apostrophe
  115.     "lpar",     "(",    # left parenthesis
  116.     "rpar",     ")",    # right parenthesis
  117.     "comma",    ",",    # comma
  118.     "lowbar",   "_",    # low line
  119.     "hyphen",   "-",    # hyphen
  120.     "period",   ".",    # full stop, period
  121.     "sol",      "/",    # solidus
  122.     "colon",    ":",    # colon
  123.     "semi",     ";",    # semicolon
  124.     "quest",    "?",    # question mark
  125.     "laquo",    "<<",   # angle quotation mark, left
  126.     "&#171",    "<<",
  127.     "raquo",    ">>",   # angle quotation mark, right
  128.     "#187",     ">>",
  129.     "lsquo",    "'",    # single quotation mark, left
  130.     "rsquo",    "'",    # single quotation mark, right
  131.     "ldquo",    '"',    # double quotation mark, left
  132.     "rdquo",    '"',    # double quotation mark, right
  133.     "nbsp",     " ",    # no break (required) space
  134.     "#160",     " ",
  135.     "shy",      "-",    # soft hyphen
  136.     "#173",     "-",
  137.  
  138. #---------------
  139. # Added Latin 1
  140. #---------------
  141. #   Entity      ASCII substitution
  142. #   ----------  ------------------
  143.     "aacute",   "a",    # small a, acute accent
  144.     "#225",     "a",
  145.     "Aacute",   "A",    # capital A, acute accent
  146.     "#193",     "A",
  147.     "acirc",    "a",    # small a, circumflex accent
  148.     "#226",     "a",
  149.     "Acirc",    "A",    # capital A, circumflex accent
  150.     "#194",     "A",
  151.     "agrave",   "a",    # small a, grave accent
  152.     "#224",     "a",
  153.     "Agrave",   "A",    # capital A, grave accent
  154.     "#192",     "A",
  155.     "aring",    "a",    # small a, ring
  156.     "#229",     "a",
  157.     "Aring",    "A",    # capital A, ring
  158.     "#197",     "A",
  159.     "atilde",   "a",    # small a, tilde
  160.     "#227",     "a",
  161.     "Atilde",   "A",    # capital A, tilde
  162.     "#195",     "A",
  163.     "auml",     "a",    # small a, dieresis or umlaut mark
  164.     "#228",     "a",
  165.     "Auml",     "A",    # capital A, dieresis or umlaut mark
  166.     "#196",     "A",
  167.     "aelig",    "ae",   # small ae diphthong (ligature)
  168.     "#230",     "ae",
  169.     "AElig",    "AE",   # capital AE diphthong (ligature)
  170.     "#198",     "AE",
  171.     "ccedil",   "c",    # small c, cedilla
  172.     "#231",     "c",
  173.     "Ccedil",   "C",    # capital C, cedilla
  174.     "#199",     "C",
  175.     "eth",      "d",    # small eth, Icelandic
  176.     "#240",     "d",
  177.     "ETH",      "d",    # capital Eth, Icelandic
  178.     "#208",     "d",
  179.     "eacute",   "e",    # small e, acute accent
  180.     "#233",     "e",
  181.     "Eacute",   "E",    # capital E, acute accent
  182.     "#201",     "E",
  183.     "ecirc",    "e",    # small e, circumflex accent
  184.     "#234",     "e",
  185.     "Ecirc",    "E",    # capital E, circumflex accent
  186.     "#202",     "E",
  187.     "egrave",   "e",    # small e, grave accent
  188.     "#232",     "e",
  189.     "Egrave",   "E",    # capital E, grave accent
  190.     "#200",     "E",
  191.     "euml",     "e",    # small e, dieresis or umlaut mark
  192.     "#235",     "e",
  193.     "Euml",     "E",    # capital E, dieresis or umlaut mark
  194.     "#203",     "E",
  195.     "iacute",   "i",    # small i, acute accent
  196.     "#237",     "i",
  197.     "Iacute",   "I",    # capital I, acute accent
  198.     "#205",     "I",
  199.     "icirc",    "i",    # small i, circumflex accent
  200.     "#238",     "i",
  201.     "Icirc",    "I",    # capital I, circumflex accent
  202.     "#206",     "I",
  203.     "igrave",   "i",    # small i, grave accent
  204.     "#236",     "i",
  205.     "Igrave",   "I",    # capital I, grave accent
  206.     "#204",     "I",
  207.     "iuml",     "i",    # small i, dieresis or umlaut mark
  208.     "#239",     "i",
  209.     "Iuml",     "I",    # capital I, dieresis or umlaut mark
  210.     "#207",     "I",
  211.     "ntilde",   "n",    # small n, tilde
  212.     "#241",     "n",
  213.     "Ntilde",   "N",    # capital N, tilde
  214.     "#209",     "N",
  215.     "oacute",   "o",    # small o, acute accent
  216.     "#243",     "o",
  217.     "Oacute",   "O",    # capital O, acute accent
  218.     "#211",     "O",
  219.     "ocirc",    "o",    # small o, circumflex accent
  220.     "#244",     "o",
  221.     "Ocirc",    "O",    # capital O, circumflex accent
  222.     "#212",     "O",
  223.     "ograve",   "o",    # small o, grave accent
  224.     "#242",     "o",
  225.     "Ograve",   "O",    # capital O, grave accent
  226.     "#210",     "O",
  227.     "oslash",   "o",    # small o, slash
  228.     "#248",     "o",
  229.     "Oslash",   "O",    # capital O, slash
  230.     "#216",     "O",
  231.     "otilde",   "o",    # small o, tilde
  232.     "#245",     "o",
  233.     "Otilde",   "O",    # capital O, tilde
  234.     "#213",     "O",
  235.     "ouml",     "o",    # small o, dieresis or umlaut mark
  236.     "#246",     "o",
  237.     "Ouml",     "O",    # capital O, dieresis or umlaut mark
  238.     "#214",     "O",
  239.     "szlig",    "s",    # small sharp s, German (sz ligature)
  240.     "#223",     "s",
  241.     "thorn",    "p",    # small thorn, Icelandic
  242.     "#254",     "p",
  243.     "THORN",    "P",    # capital THORN, Icelandic
  244.     "#222",     "P",
  245.     "uacute",   "u",    # small u, acute accent
  246.     "#250",     "u",
  247.     "Uacute",   "U",    # capital U, acute accent
  248.     "#218",     "U",
  249.     "ucirc",    "u",    # small u, circumflex accent
  250.     "#251",     "u",
  251.     "Ucirc",    "U",    # capital U, circumflex accent
  252.     "#219",     "U",
  253.     "ugrave",   "u",    # small u, grave accent
  254.     "#249",     "u",
  255.     "Ugrave",   "U",    # capital U, grave accent
  256.     "#217",     "U",
  257.     "uuml",     "u",    # small u, dieresis or umlaut mark
  258.     "#252",     "u",
  259.     "Uuml",     "U",    # capital U, dieresis or umlaut mark
  260.     "#220",     "U",
  261.     "yacute",   "y",    # small y, acute accent
  262.     "#253",     "y",
  263.     "Yacute",   "Y",    # capital Y, acute accent
  264.     "#221",     "Y",
  265.     "yuml",     "y",    # small y, dieresis or umlaut mark
  266.     "#255",     "y",
  267. );
  268. ##---------------------------------------------------------------------------##
  269. ##------------##
  270. ## Begin MAIN ##
  271. ##------------##
  272. {
  273.  
  274. local(@array);
  275.  
  276. $HTML = 1  if "@ARGV" =~ /-html/;
  277.  
  278. &SGMLread_sgml(STDIN, *array);
  279. foreach (@array) {
  280.     if ($HTML) {
  281.         if (/^<A\s/i) {                 # Check for URL
  282.             if (/href\s*=\s*['"]([^'"]+)['"]/i) {
  283.                 $url = $1;
  284.             } else {
  285.                 $url = '';
  286.             }
  287.         } elsif (m%^</A\s*>%i) {        # Output URL at anchor end
  288.             print STDOUT " <URL:$url>"  if $url;
  289.         }
  290.     }
  291.     next if /^</;
  292.     s/\&([#\w-._]+);/$Entity{$1}/g;
  293.     print STDOUT $_;
  294. }
  295. exit 0;
  296.  
  297. }
  298. ##----------##
  299. ## End MAIN ##
  300. ##----------##
  301. ##---------------------------------------------------------------------------##

Raw Paste


Login or Register to edit or fork this paste. It's free.