PYTHON   43
new bib citation parse py
Guest on 23rd August 2022 10:53:11 AM


  1. #! /usr/bin/env python
  2. ## -*- coding: utf-8 -*-
  3. """
  4. Created on Tue Sep 29 23:25:58
  5.  
  6. @author: siome
  7. """
  8.  
  9. ## http://stackoverflow.com/questions/13200709/extract-google-scholar-results-using-python-or-r
  10.  
  11. """    
  12.  
  13. I suggest that you not use libraries tailored to specific websites for crawling; instead, use general-purpose HTML libraries that are well tested and have well-formed documentation, such as BeautifulSoup.
  14.  
  15. For accessing websites with a browser information, you could use an url opener class with a custom user agent:
  16.  
  17. from urllib import FancyURLopener
  18. class MyOpener(FancyURLopener):
  19.    version = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.152 Safari/537.36'
  20. openurl = MyOpener().open
  21.  
  22. And then download the required url as follows:
  23.  
  24. openurl(url).read()
  25.  
  26. For retrieving scholar results just use http://scholar.google.se/scholar?hl=en&q=${query} url.
  27.  
  28. To extract pieces of information from a retrieved HTML file, you could use this piece of code:
  29.  
  30. from bs4 import SoupStrainer, BeautifulSoup
  31. page = BeautifulSoup(openurl(url).read(), parse_only=SoupStrainer('div', id='gs_ab_md'))
  32.  
  33. This piece of code extracts a concrete div element that contains number of results shown in a Google Scholar search results page.
  34. """
  35.  
  36. """
  37.  
  38. s is the page, for example (s = f.read())
  39.  
  40. gb = BeautifulSoup(s, parse_only=SoupStrainer('div', id='gsc_graph_bars'))
  41.  
  42. or:
  43. soup = BeautifulSoup(html_doc, 'html.parser')
  44. gb = soup.find('div', id = 'gsc_graph_bars')
  45.  
  46.  
  47. for link in gb.find_all('a'):
  48.    year = int(link.get('href')[-4:])
  49.    cites = int(link.span.contents[0])
  50.    print year, cites
  51.  
  52.  
  53.  
  54. total citations string = <div class="gsc_value">
  55. Cited by xxx
  56.  
  57. import re
  58. m = re.search('Cited by ([0-9]*)', s)
  59. int(m.group(1))
  60.  
  61.  
  62.  
  63. """
  64.  
  65.  
  66. import copy
  67. import sys
  68. import codecs
  69. #import logging
  70. #logging.basicConfig()
  71.  
  72.  
  73. import bibtexparser
  74. from   bibtexparser.bparser import BibTexParser
  75. import bibtexparser.customization
  76.  
  77. #from zope.publisher.browser import TestRequest
  78.  
  79. import re
  80. import random
  81. from time import sleep
  82.  
  83.  
  84. from bibliograph.parsing.parsers.bibtex import BibtexParser
  85. from bs4 import BeautifulSoup
  86.  
  87. from urllib import FancyURLopener
  88. class MyOpener(FancyURLopener):
  89.     version = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.152 Safari/537.36'
  90. openurl = MyOpener().open
  91.  
  92. areas = { u'cv':(u'Computer Vision', u'Images/Vision-100.png'),
  93.           u'ai':(u'Artificial Intelligence', u'Images/AI-100.png'),
  94.           u'ml':(u'Machine Learning', u'Images/ML-80.png'),
  95.           u'forensics':(u'Forensics', u'Images/Forensics-80.png'),
  96.           u'access':(u'Accessibility', u'Images/Access-80.png'),
  97.           u'cg':(u'Computer Graphics', u'Images/Graphics-80.png'),
  98.           u'asl':(u'Sign Language', u'Images/ASL-100.png'),
  99.           u'wavelets':(u'Wavelets', u'Images/Wavelet-100.png'),
  100.           u'sport': (u'Sport', u'Images/Sport-80.png'),
  101.           u'uncert': (u'Uncertanties', u'Images/Uncert-100.png'),
  102.           u'virtcrd': (u'Virtual Crowds', u'Images/Crowd-80.png'),
  103.           u'med': (u'Biomedical', u'Images/Med-64.png'),
  104.           u'biblio': (u'Bibliometrics', u'Images/Book-80.png')
  105.           }
  106.  
  107. areas_keys = areas.keys()
  108.  
  109.  
  110. def parseBibtex( bibtex_fname ):
  111.  
  112.  
  113.     with open(bibtex_fname) as bibtex_file:
  114.         bibtex_database = bibtexparser.load(bibtex_file)
  115.  
  116.     # get list of dictionaries
  117.     return bibtex_database.entries
  118.  
  119.  
  120. def grabCitations(ref):
  121.  
  122.     #  test first if there is a gscholar page
  123.  
  124.     nref = copy.deepcopy(ref)
  125.  
  126.  
  127.     nref[u'totalcites'] = '0'
  128.     nref[u'citeshist'] = ''
  129.  
  130.     if 'gscholar' in nref:
  131.  
  132.         surl = ("https://scholar.google.com.br/"
  133.                 "citations?view_op=view_citation&hl=en&"
  134.                 "citation_for_view=%s") % nref['gscholar']
  135.  
  136.         nref[u'gscholarpage'] = openurl(surl).read()
  137.    
  138.         nref[u'soup'] =  BeautifulSoup(nref['gscholarpage'], 'html.parser')
  139.         gb = nref[u'soup'].find('div', id = 'gsc_graph_bars')
  140.  
  141.         cithist = ""
  142.         nref[u'totalcites'] = '0'
  143.    
  144.         if not gb is None:
  145.             for link in gb.find_all('a'):
  146.                 year = int(link.get('href')[-4:])
  147.                 cites = int(link.span.contents[0])
  148.                 cithist = cithist + "%d,%d;" % (year,cites)
  149.    
  150.             cithist = cithist[:-1]
  151.             m = re.search('Cited by ([0-9]*)', nref['gscholarpage'])
  152.             nref[u'totalcites'] = m.group(1)
  153.  
  154.         nref[u'citeshist'] = cithist
  155.  
  156.    
  157.     return nref
  158.  
  159.  
  160.  
  161. def grabAllCitations(bibs):
  162.  
  163.     nbibs = []
  164.     i = 1
  165.     for b in bibs:
  166.         if i > 1:
  167.             sleep(random.uniform(1.,6.))
  168.  
  169. #        print "Begining %s -> %d of %d" % (b['pid'], i, n)
  170.  
  171. #        print "  antes type(b) = " + str(type(b))
  172. #        for k,v in b.iteritems():
  173. #            print "    %s, %s" %(str(k), str(v))
  174.  
  175.         b2 = grabCitations(b)
  176.  
  177.         nbibs.append( b2 )
  178.         i = i + 1
  179.  
  180.  
  181.     return nbibs
  182.  
  183.  
  184. def generateBiBTeX(refs,
  185.             valid_fields=[u'title', u'editor', u'author', u'journal',
  186.                           u'booktitle', u'publisher', u'school', u'issn',
  187.                           u'volume', u'number', u'pages', u'year', u'doi',
  188.                           u'totalcites' ,u'citeshist', u'gscholar'],
  189.               omit_fields=[]):
  190.  
  191.     s = ""
  192.        
  193.    
  194.     for bib in refs:
  195.         s2 =    toBiBTeX(bib,valid_fields,omit_fields)            
  196.         s  =    s + s2 + "\n"
  197.  
  198.     return s
  199.  
  200.  
  201. def toBiBTeX (ref,
  202.             valid_fields=[u'title', u'editor', u'author', u'journal', u'booktitle',
  203.                           u'publisher', u'school', u'issn', u'volume', u'number',  
  204.                           u'pages', u'year', u'doi', u'pdf'],
  205.               omit_fields=[]):
  206.  
  207. #    print "tobibtex " + str(type(ref))
  208. #    print str(ref)
  209.  
  210.  
  211.     omit   = [each.lower() for each in omit_fields]
  212.     fields = [each.lower() for each in valid_fields]
  213.  
  214. #    ttable = [('&', r'\&'),
  215. #              ('~', r'\~'),
  216. #              ('&mdash;',r'---'),
  217. #              ('&ndash;', r'--'),
  218. #              ]
  219.  
  220.     bib_key = ref[u'ID']
  221.     ref_type = ref[u'ENTRYTYPE'].replace(u'Reference','')
  222.     bibtex = u'\n@' + ref_type + u'{' + bib_key + u',\n' # '%s{%s,\n" %  (ref_type, bib_key)
  223.  
  224.     ref_keys = ref.keys()
  225.  
  226.     for k in fields:
  227.         if k in ref_keys and k not in omit:
  228.             if type(ref[k]) == list:
  229.                 nv = ref[k][0]
  230.             else:
  231.                 nv = ref[k]
  232.  
  233. #            for a,b in ttable:
  234. #                nv = nv.replace(a,b)
  235.  
  236.             bibtex = bibtex + u'  %s = {%s},\n' % (k,nv)
  237. #            bibtex = bibtex + u'  '
  238. #            bibtex = bibtex + k
  239. #            bibtex = bibtex + u' = {'  
  240. #            #            bibtex = bibtex + unicode(nv.encode('utf8'))
  241. #            #            bibtex = bibtex + unicode(nv, encoding='utf8')
  242. #            bibtex = bibtex +  nv
  243. #            bibtex = bibtex + u'},\n'
  244.            
  245.     # remove trailing command
  246.     bibtex = bibtex[0:-2] + u"}\n"
  247.  
  248.  
  249.     return bibtex
  250.  
  251.  
  252. if __name__ == "__main__":
  253.     if len(sys.argv) != 3:
  254.         print '\n typical usage:'
  255.         print '  "%s" input output\n' % sys.argv[0]
  256.         print '     input  - input bibtex file'
  257.         print '     output - output bibtex file'
  258.         sys.exit()
  259.     fname = sys.argv[1]
  260.     fnout  = sys.argv[2]
  261. #    bibfile = codecs.open(fname,encoding='latin_1').read()
  262. #    bibs = parseBibtex( bibfile )
  263.     bibs = parseBibtex( fname )
  264.  
  265.     nbibs = grabAllCitations(bibs)
  266.  
  267.     so  = generateBiBTeX(nbibs,[u'title', u'editor', u'author', u'journal',
  268.                                 u'booktitle', u'publisher', u'school', u'issn',
  269.                                 u'volume', u'number', u'pages', u'year',
  270.                                 u'doi', u'totalcites' ,u'citeshist',
  271.                                 u'gscholar', u'pdf', u'abstract'],
  272.                                 [])
  273.    
  274.     with open(fnout,'w') as fo:
  275.         fo.write(so)
  276.  
  277. #    print generateBiBTeX(bibs).encode('latin_1')

Raw Paste

Login or Register to edit or fork this paste. It's free.