PYTHON   11

mybibtex py

Guest on 23rd August 2022 10:52:00 AM

  1. ############################################################################
  2. #                                                                          #
  3. #             copyright (c)  ITB, Humboldt-University Berlin           #
  4. #             written by: Raphael Ritz, r.ritz@biologie.hu-berlin.de       #
  5. #                                                                          #
  6. ############################################################################
  7.  
  8. """BibtexParser class"""
  9.  
  10. import os
  11. import re
  12.  
  13. from zope.component import getUtility, ComponentLookupError
  14.  
  15. from bibliograph.parsing.parsers.base import BibliographyParser
  16. from bibliograph.rendering.interfaces import IBibTransformUtility
  17.  
  18. from bibliograph.core.utils import _encode, _decode
  19. from bibliograph.core.bibutils import _hasCommands
  20. from bibliograph.core.encodings import _latex2utf8enc_mapping
  21. from bibliograph.core.encodings import _latex2utf8enc_mapping_simple
  22.  
  23.  
  24. _encoding = 'utf-8'   # XXX: should be taken from the site configuration
  25. haveBibUtils = _hasCommands('bib2xml')
  26. FIX_BIBTEX = os.environ.has_key('FIX_BIBTEX')
  27.  
  28. class BibtexParser(BibliographyParser):
  29.     """
  30.    A specific parser to process input in BiBTeX-format.
  31.    """
  32.  
  33.     meta_type = "Bibtex Parser"
  34.  
  35.     format = {'name':'BibTeX',
  36.               'extension':'bib'}
  37.  
  38.     def __init__(self,
  39.                  id = 'bibtex',
  40.                  title = "BibTeX parser",
  41.                  delimiter = '}\s*@',
  42.                  pattern = '(,\s*[\w\-]{2,}\s*=)'):
  43.         """
  44.        initializes including the regular expression patterns
  45.        """
  46.         self.id = id
  47.         self.title = title
  48.         self.setDelimiter(delimiter)
  49.         self.setPattern(pattern)
  50.  
  51.  
  52.     # Here we need to provide 'checkFormat' and 'parseEntry'
  53.     def checkFormat(self, source):
  54.         """
  55.        is this my format?
  56.        """
  57.         pattern = re.compile('^@[A-Z|a-z]*{', re.M)
  58.         all_tags = re.findall(pattern, source)
  59.  
  60.         if all_tags:
  61.             for t in all_tags:
  62.                 type = t.strip('@{').lower()
  63.                 if type not in ('article','book','booklet','conference','inbook','incollection',
  64.                                 'inproceedings','manual','mastersthesis','misc','phdthesis',
  65.                                 'proceedings','techreport','unpublished','collection','patent',
  66.                                 'webpublished'):
  67.                     return 0
  68.             return 1
  69.         else:
  70.             return 0
  71.  
  72.     def preprocess(self, source):
  73.         """
  74.        expands LaTeX macros
  75.        removes LaTeX commands and special formating
  76.        converts special characters to their HTML equivalents
  77.        """
  78.         source = self.expandMacros(source)
  79.  
  80.         # let Bibutils cleanup up the BibTeX mess
  81.         if FIX_BIBTEX and haveBibUtils:
  82.             try:
  83.                 tool = getUtility(IBibTransformUtility, name=u"external")
  84.                 source = tool.transform(source, 'bib', 'bib')
  85.             except ComponentLookupError:
  86.                 pass
  87.  
  88.         source = self.stripComments(source)
  89.         source = self.convertChars(source)
  90.         # it is important to convertChars before stripping off commands!!!
  91.         # thus, whatever command will be replaced by a unicode value... the
  92.         # remaining LaTeX commands will vanish here...
  93.         source = self.stripCommands(source)
  94.         return source
  95.  
  96.     def expandMacros(self, source):
  97.         source = self.expandStringMacros(source)
  98.         # add more macro conventions here if there are any
  99.         return source
  100.  
  101.     def expandStringMacros(self, source):
  102.         lines = source.split('\n')
  103.         macros = []
  104.         sourcelns = []
  105.         for line in lines:
  106.             if line.find('@String') > -1:
  107.                 macros.append(line)
  108.             else:
  109.                 sourcelns.append(line)
  110.         source = '\n'.join(sourcelns)
  111.         for macro in macros:
  112.             split_on = re.compile('[{=}]+')
  113.             raw_matches = split_on.split(macro)
  114.             matches = [m for m in raw_matches if m not in ['', ' ', '\r']]
  115.             # raise str(matches)
  116.             short = matches[1].strip()
  117.             long = matches[-1].strip()
  118.             pattern = "\\b" + short + "\\b"
  119.             old = re.compile(pattern)
  120.             source = old.sub(long, source)
  121.         return source
  122.  
  123.     def stripCommands(self, source):
  124.         oldstyle_cmd = re.compile(r'{\\[a-zA-Z]{2,}')
  125.         newstyle_cmd = re.compile(r'\\[a-zA-Z]+{')
  126.         source = oldstyle_cmd.sub('{', source)
  127.         source = newstyle_cmd.sub('{', source)
  128.         return source
  129.  
  130.     def stripComments(self, source):
  131.  
  132.         inside_entry = False
  133.         waiting_for_first_brace = False
  134.         newsource = ''
  135.  
  136.         for idx in range(len(source)):
  137.  
  138.             char = source[idx]
  139.             last_char = (idx > 0) and source[idx-1] or '\n'
  140.             next_char = (idx < len(source)-1) and source[idx+1] or '\n'
  141.  
  142.             if char == '@' and not inside_entry:
  143.                 inside_entry = True
  144.                 waiting_for_first_brace = True
  145.                 braces_nesting_level = 0
  146.  
  147.             if inside_entry:
  148.  
  149.                 newsource = newsource + char
  150.                 if char == '{' and last_char != "\\":
  151.                     braces_nesting_level += 1
  152.  
  153.                 if char == '}' and last_char != "\\":
  154.                     braces_nesting_level -= 1
  155.  
  156.                 if waiting_for_first_brace and (braces_nesting_level == 1):
  157.                     waiting_for_first_brace = False
  158.  
  159.                 if (braces_nesting_level == 0) and not waiting_for_first_brace and (char == '}'):
  160.                     inside_entry = False
  161.                     newsource = newsource + "\n"
  162.  
  163.         completely = re.compile('.*')
  164.         ## this line caused issue #20, leaving it here for now... mg-20061023
  165.         # source = completely.sub(newsource, 'dummy')
  166.         ## this line fixes issue #20
  167.         source = newsource
  168.         return source
  169.  
  170.     def convertChars(self, source):
  171.         source = self.convertLaTeX2Unicode(source)
  172.         source = self.fixWhiteSpace(source)
  173.         return self.explicitReplacements(source)
  174.  
  175.     def convertLaTeX2Unicode(self, source):
  176.         for latex_entity in _latex2utf8enc_mapping_simple.keys():
  177.             source = _encode(_decode(source).replace(latex_entity, _latex2utf8enc_mapping_simple[latex_entity]))
  178.  
  179.         for latex_entity in _latex2utf8enc_mapping.keys():
  180.             source = _encode(_decode(source).replace(latex_entity, _latex2utf8enc_mapping[latex_entity]))
  181.  
  182.         return source
  183.  
  184.     def fixWhiteSpace(self, source):
  185.         ttable = [(r'\ ', ' '),
  186.                   (r'\!', ' '),
  187.                   ]
  188.         source = self.mreplace(source, ttable)
  189.         wsp_tilde = re.compile(r'[^/\\]~')
  190.         return wsp_tilde.sub(self.tilde2wsp, source).replace('\~', '~')
  191.  
  192.     def tilde2wsp(self, hit):
  193.         return hit.group(0)[0] + ' '
  194.  
  195.     def explicitReplacements(self, source):
  196.         # list of 2 tuples; second element replaces first
  197.         ttable = [(r'\/', ''),
  198.                   (r'\&', '&'),
  199.                   (r'\~', '~'),
  200.                   (r'---', '&mdash;'),
  201.                   (r'--', '&ndash;'),
  202.                   ]
  203.         return self.mreplace(source, ttable)
  204.  
  205.     def mreplace(self, s, ttable):
  206.         for a, b in ttable:
  207.             s = s.replace(a, b)
  208.         return s
  209.  
  210.     # done with preprocessing
  211.  
  212.     def parseEntry(self, entry):
  213.         """
  214.        parses a single entry
  215.  
  216.        returns a dictionary to be passed to
  217.        BibliographyEntry's edit method
  218.        """
  219.         result = {}
  220.         authorlist = []
  221.         authorURLlist = []
  222.  
  223.         # remove newlines and <CR>s, and remove the last '}'
  224.         entry = entry.replace('\n', ' ').replace('\r', '').replace('\t', ' ').rstrip().rstrip('}')
  225.         tokens = self.pattern.split(entry)
  226.         try:
  227.             type, pid = tokens[0].strip().split('{')
  228.             type = type.replace('@', '').strip().lower()
  229.             result['reference_type'] = type.capitalize() + 'Reference'
  230.             result['pid'] = pid.replace(',', '').strip()
  231.         except:
  232.             return "Bibtex Parser Error: malformed first line."
  233.  
  234.         for k,v in self.group(tokens[1:],2):
  235.             key = k[1:-1].strip().lower()
  236.  
  237.             ###  This I remove in my local copy            
  238.             # INBOOKs mapping: title -> booktitle, chapter -> chapter and title
  239.             # if type == 'inbook':
  240.             #     if key == 'title':
  241.             #         key = 'booktitle'
  242.  
  243.             #     if key == 'chapter':
  244.             #         result['title'] = self.cleanLine(v)
  245.  
  246.             # BibTex field "type" maps to CMFBAT field "publication_type"
  247.             if key == 'type':
  248.                 key = 'publication_type'
  249.                 result[key] = self.cleanLine(v)
  250.  
  251.             # special procedure for authors and editors
  252.             elif key == 'author':
  253.                 if result.has_key('author'):
  254.                     result[key].append(self.cleanLine(v))
  255.                 else:
  256.                     result[key] = [ self.cleanLine(v) ]
  257.             elif (key == 'editor') and (type in ['book','proceedings']):
  258.                 if result.has_key('editor'):
  259.                     result[key].append(self.cleanLine(v))
  260.                 else:
  261.                     result[key] = [self.cleanLine(v)]
  262.             elif key == 'keywords':
  263.                 if not result.has_key(key):
  264.                     # Original BibTeX files contain only *one* 'keywords = '
  265.                     # for multiple keywords
  266.                     result[key] = self.splitMultiple(v)
  267.                 else:
  268.                     # This is likely used by other importers/parser trying to mis-use
  269.                     # the BibTeX importer with multiple keywords
  270.                     result[key].append(self.cleanLine(v))
  271.             else:
  272.                 value = self.cleanLine(v)
  273.                 result[key] = value
  274.                 # Aliasing the value to an upper-cased key so that when this dictionary
  275.                 # is passed into <a_reference_object>.edit(**dictionary), the values
  276.                 # will match and auto-update schema fields that are specified in either
  277.                 # upper or lower case.  This is motivated by the 'DOI' field being upper-cased.
  278.                 # Of course, this won't help mixed-case fields, but we'd probably need to change
  279.                 # Archetype internals to fix that - and that would be a questionable endeavour.
  280.                 result[key.upper()] = value
  281.  
  282.             #print key, result[key]
  283.  
  284.         # compile authors list of dictionaries
  285.         # we can have authors
  286.         if result.has_key('author'):
  287.             for each in result['author']:
  288.                 each = each.replace(' AND', ' and')
  289.                 authorlist.extend( each.split(' and') )
  290.         # but for some bibref types we can have editors alternatively
  291.         elif result.has_key('editor') and (type in ['book','proceedings']):
  292.             result['editor_flag'] = True
  293.             for each in result['editor']:
  294.                 each = each.replace(' AND', ' and')
  295.                 authorlist.extend( each.split(' and') )
  296.         if result.has_key('authorURLs'):
  297.             authorURLlist = result['authorURLs'].split('and ')
  298.  
  299.         if authorlist:
  300.             alist = []
  301.             authorlist = [x for x in authorlist if x]
  302.             for author in authorlist:
  303.                 fname = mname = lname = ''
  304.                 parts = self.splitAuthor(author)
  305.                 if len(parts) == 1:
  306.                     lname = parts[0].strip()
  307.                 else:
  308.                     lname = parts[-1].strip()
  309.                     fname = parts[0].strip()
  310.                     if parts[1:-1]:
  311.                         mname = ' '.join([_ for _ in parts[1:-1]])
  312.                 adict = {'firstname': fname,
  313.                          'middlename': mname,
  314.                          'lastname': lname}
  315.                 alist.append(adict)
  316.  
  317.         if authorURLlist and alist:
  318.             index = 0
  319.             for url in authorURLlist:
  320.                 alist[index]['homepage'] = url.strip()
  321.                 index += 1
  322.  
  323.         if authorlist:
  324.             result['authors'] = alist
  325.  
  326.         # do some renaming and reformatting
  327.         tmp = result.get('note')
  328.         while tmp and tmp[-1] in ['}', ',', '\n', '\r']:
  329.             tmp = tmp[:-1]
  330.         if tmp:
  331.             result['note'] = tmp
  332.         result['publication_year'] = result.get('year', '')
  333.         result['publication_month'] = result.get('month', '')
  334.         result['publication_url'] = result.get('url', '')
  335.         ## result['publication_title'] = result.get('title', '')
  336.         tmp = result.get('title','')
  337.         for car in ('\n', '\r', '\t'):
  338.             tmp = tmp.replace(car, ' ')
  339.         while '  ' in tmp:
  340.             tmp = tmp.replace('  ', ' ')
  341.         result['title'] = tmp
  342.  
  343.         # collect identifiers
  344.         identifiers = list()
  345.         for key in ('isbn', 'doi', 'asin', 'purl', 'urn', 'issn'):
  346.             if key in result:
  347.                 identifiers.append({'label' : key.upper(), 'value': result[key]})
  348.         if identifiers:
  349.             result['identifiers'] = identifiers
  350.  
  351.         return result
  352.  
  353.     # the helper method's
  354.  
  355.     def splitAuthor(self, author=None):
  356.         if not author:
  357.             return []
  358.         #parts = author.replace('.', ' ').split(',',1)
  359.         parts = author.split(',',1)
  360.         if len(parts) == 1:
  361.             return parts[0].split()
  362.         else:
  363.             tmp = parts[1].split()
  364.             tmp.append(parts[0])
  365.             return tmp
  366.  
  367.     def splitMultiple(self, value):
  368.         value = self.clean(value)
  369.         result = list()
  370.         for item in value.split(','):
  371.             item = item.strip()
  372.             if item:
  373.                 result.append(item)
  374.         return result
  375.  
  376.     def clean(self, value):
  377.         value = value.replace('{', '').replace('}', '').strip()
  378.         if value and value[0] == '"' and len(value) > 1:
  379.             value = value[1:-1]
  380.         return value
  381.  
  382.     def cleanLine(self, value):
  383.         return self.clean(value.rstrip().rstrip(',').rstrip())
  384.  
  385.     def group(self, p,n):
  386.         """ Group a sequence p into a list of n tuples."""
  387.         mlen, lft = divmod(len(p), n)
  388.         if lft != 0:
  389.             mlen += 1
  390.  
  391.         # initialize a list of suitable length
  392.         lst = [[None]*n for i in range(mlen)]
  393.  
  394.         # Loop over all items in the input sequence
  395.         for i in range(len(p)):
  396.             j,k = divmod(i,n)
  397.             lst[j][k] = p[i]
  398.  
  399.         return map(tuple, lst)

Raw Paste


Login or Register to edit or fork this paste. It's free.