PYTHON   56
bib citation parse
Guest on 23rd August 2022 10:48:21 AM


  1. #! /usr/bin/env python
  2. ## -*- coding: utf-8 -*-
  3. """
  4. Created on Tue Sep 29 23:25:58 2015
  5.  
  6. @author: siome
  7. """
  8.  
  9. ## http://stackoverflow.com/questions/13200709/extract-google-scholar-results-using-python-or-r
  10.  
  11. """    
  12.  
  13. I suggest that you not use site-specific libraries for crawling particular websites, but instead use general-purpose HTML libraries that are well tested and have well-formed documentation, such as BeautifulSoup.
  14.  
  15. To access websites while presenting browser information, you can use a URL opener class with a custom user agent:
  16.  
  17. from urllib import FancyURLopener
  18. class MyOpener(FancyURLopener):
  19.    version = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.152 Safari/537.36'
  20. openurl = MyOpener().open
  21.  
  22. And then download the required url as follows:
  23.  
  24. openurl(url).read()
  25.  
  26. For retrieving scholar results just use http://scholar.google.se/scholar?hl=en&q=${query} url.
  27.  
  28. To extract pieces of information from a retrieved HTML file, you could use this piece of code:
  29.  
  30. from bs4 import SoupStrainer, BeautifulSoup
  31. page = BeautifulSoup(openurl(url).read(), parse_only=SoupStrainer('div', id='gs_ab_md'))
  32.  
  33. This piece of code extracts the specific div element that contains the number of results shown on a Google Scholar search results page.
  34. """
  35.  
  36. """
  37.  
  38. s is the page, for example (s = f.read())
  39.  
  40. gb = BeautifulSoup(s, parse_only=SoupStrainer('div', id='gsc_graph_bars'))
  41.  
  42. or:
  43. soup = BeautifulSoup(html_doc, 'html.parser')
  44. gb = soup.find('div', id = 'gsc_graph_bars')
  45.  
  46.  
  47. for link in gb.find_all('a'):
  48.    year = int(link.get('href')[-4:])
  49.    cites = int(link.span.contents[0])
  50.    print year, cites
  51.  
  52.  
  53.  
  54. total citations string = <div class="gsc_value">
  55. Cited by xxx
  56.  
  57. import re
  58. m = re.search('Cited by ([0-9]*)', s)
  59. int(m.group(1))
  60.  
  61.  
  62.  
  63. """
  64.  
  65.  
  66. import copy
  67. import sys
  68. import codecs
  69. #import logging
  70. #logging.basicConfig()
  71.  
  72. #from zope.publisher.browser import TestRequest
  73.  
  74. import re
  75. import random
  76. from time import sleep
  77.  
  78.  
  79. from bibliograph.parsing.parsers.bibtex import BibtexParser
  80. from bs4 import BeautifulSoup
  81.  
  82. from urllib import FancyURLopener
  83. class MyOpener(FancyURLopener):
  84.     version = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.152 Safari/537.36'
  85. openurl = MyOpener().open
  86.  
  87. areas = { 'cv':('Computer Vision', 'Images/Vision-100.png'),
  88.           'ai':('Artificial Intelligence', 'Images/AI-100.png'),
  89.           'forensics':('Forensics', 'Images/Forensics-80.png'),
  90.           'cg':('Computer Graphics', 'Images/Graphics-80.png'),
  91.           'asl':('Sign Language', 'Images/ASL-100.png'),
  92.           'wavelets':('Wavelets', 'Images/Wavelet-100.png'),
  93.           'sport': ('Sport', 'Images/Sport-80.png'),
  94.           'uncert': ('Uncertanties', 'Images/Uncert-100.png'),
  95.           'virtcrd': ('Virtual Crowds', 'Images/Crowd-80.png'),
  96.           'med': ('Biomedical', 'Images/Med-64.png'),
  97.           'biblio': ('Bibliometrics', 'Images/Book-80.png')
  98.           }
  99.  
  100. areas_keys = areas.keys()
  101.  
  102.  
  103. def parseBibtex( source ):
  104.  
  105.     # Bibtex parser
  106.     parser = BibtexParser()
  107.  
  108.     # fix encoding
  109.     #//    source = parser.checkEncoding(source)
  110.  
  111.     # fix latex special characters, among other things
  112.     source = parser.preprocess(source) #.encode('utf-8'))
  113.  
  114.     # get list of dictionaries
  115.     return parser.getEntries(source)
  116.  
  117.  
  118. def grabCitations(ref):
  119.  
  120.     #  test first if there is a gscholar page
  121.  
  122.     nref = copy.deepcopy(ref)
  123.  
  124.  
  125.     nref[u'totalcites'] = '0'
  126.     nref[u'citeshist'] = ''
  127.  
  128.     if 'gscholar' in nref:
  129.  
  130.         surl = ("https://scholar.google.com.br/"
  131.                 "citations?view_op=view_citation&hl=en&"
  132.                 "citation_for_view=%s") % nref['gscholar']
  133.  
  134.         nref['gscholarpage'] = openurl(surl).read()
  135.    
  136.         nref['soup'] =  BeautifulSoup(nref['gscholarpage'], 'html.parser')
  137.         gb = nref['soup'].find('div', id = 'gsc_graph_bars')
  138.  
  139.         cithist = ""
  140.         nref[u'totalcites'] = '0'
  141.    
  142.         if not gb is None:
  143.             for link in gb.find_all('a'):
  144.                 year = int(link.get('href')[-4:])
  145.                 cites = int(link.span.contents[0])
  146.                 cithist = cithist + "%d,%d;" % (year,cites)
  147.    
  148.             cithist = cithist[:-1]
  149.             m = re.search('Cited by ([0-9]*)', nref['gscholarpage'])
  150.             nref[u'totalcites'] = m.group(1)
  151.  
  152.         nref[u'citeshist'] = cithist
  153.  
  154.    
  155.     return nref
  156.  
  157.  
  158.  
  159. def grabAllCitations(bibs):
  160. #    n = len(bibs)
  161. #    print "Grabbing google citations:  " + str(n) + " entries.\n"
  162.  
  163.  
  164.     # this should work, but I want the printout and random wait
  165.     #  nbibs = [ grabCitations(b) for b in bibs ]
  166.  
  167. #    print "type of bibs:  " + str(type(bibs))
  168. #    print "size of bibs:  " + str(len(bibs))
  169. #    print "type of elements:"
  170. #    for i in bibs:
  171. #        print "  "+str(type(i))
  172.  
  173.     nbibs = []
  174.     i = 1
  175.     for b in bibs:
  176.         if i > 1:
  177.             sleep(random.uniform(1.,6.))
  178.  
  179. #        print "Begining %s -> %d of %d" % (b['pid'], i, n)
  180.  
  181. #        print "  antes type(b) = " + str(type(b))
  182. #        for k,v in b.iteritems():
  183. #            print "    %s, %s" %(str(k), str(v))
  184.  
  185.         b2 = grabCitations(b)
  186.  
  187.         nbibs.append( b2 )
  188.         i = i + 1
  189.  
  190.  
  191. #    print "type of nbibs:  " + str(type(nbibs))
  192. #    print "size of nbibs:  " + str(len(nbibs))
  193. #    print "type of elements:"
  194. #    for i in nbibs:
  195. #        print "  "+str(type(i))
  196.  
  197.     return nbibs
  198.  
  199.  
  200. def generateBiBTeX(refs,
  201.             valid_fields=[u'title', u'editor', u'author', u'journal',
  202.                           u'booktitle', u'publisher', u'school', u'issn',
  203.                           u'volume', u'number', u'pages', u'year', u'doi',
  204.                           u'totalcites' ,u'citeshist', u'gscholar'],
  205.               omit_fields=[]):
  206.  
  207.     s = ""
  208.    
  209. #    print "generateBiBTeX " + str(type(refs))
  210. #    print refs    
  211.    
  212.    
  213.     for bib in refs:
  214. #        print "   " + bib
  215.         s2 =    toBiBTeX(bib,valid_fields,omit_fields)    
  216. #        print "\n\n   " + s2        
  217.        
  218.         s = s + s2 + "\n"
  219.  
  220.     return s
  221.  
  222.  
  223. def toBiBTeX (ref,
  224.             valid_fields=[u'title', u'editor', u'author', u'journal', u'booktitle',
  225.                           u'publisher', u'school', u'issn', u'volume', u'number',  
  226.                           u'pages', u'year', u'doi', u'pdf'],
  227.               omit_fields=[]):
  228.  
  229. #    print "tobibtex " + str(type(ref))
  230. #    print str(ref)
  231.  
  232.  
  233.     omit   = [each.lower() for each in omit_fields]
  234.     fields = [each.lower() for each in valid_fields]
  235.  
  236.     ttable = [('&', r'\&'),
  237.               ('~', r'\~'),
  238.               ('&mdash;',r'---'),
  239.               ('&ndash;', r'--'),
  240.               ]
  241.  
  242.     bib_key = ref['pid']
  243.     ref_type = ref['reference_type'].replace('Reference','')
  244.     bibtex = u'\n@' + ref_type + u'{' + bib_key + u',\n' # '%s{%s,\n" %  (ref_type, bib_key)
  245.  
  246.     ref_keys = ref.keys()
  247.  
  248.     for k in fields:
  249.         if k in ref_keys and k not in omit:
  250.             if type(ref[k]) == list:
  251.                 nv = ref[k][0]
  252.             else:
  253.                 nv = ref[k]
  254.  
  255.             for a,b in ttable:
  256.                 nv = nv.replace(a,b)
  257.  
  258.             bibtex = bibtex + u'  '
  259.             bibtex = bibtex + k
  260.             bibtex = bibtex + u' = {'  
  261.             #            bibtex = bibtex + unicode(nv.encode('utf8'))
  262.             #            bibtex = bibtex + unicode(nv, encoding='utf8')
  263.             bibtex = bibtex +  unicode(nv, encoding='latin_1')
  264.             bibtex = bibtex + u'},\n'
  265.            
  266.     # remove trailing command
  267.     bibtex = bibtex[0:-2] + u"}\n"
  268.  
  269.  
  270.     return bibtex
  271.  
  272.  
  273. if __name__ == "__main__":
  274.     if len(sys.argv) != 2:
  275.         print '\n' +  sys.argv[0] + ': requires name of the bibtex file!\n'
  276.         sys.exit()
  277.     fname = sys.argv[1]
  278.     bibfile = codecs.open(fname,encoding='latin_1').read()
  279.     bibs = parseBibtex( bibfile )
  280.  
  281.     nbibs = grabAllCitations(bibs)
  282.    
  283.     print generateBiBTeX(nbibs,[u'title', u'editor', u'author', u'journal',
  284.                                 u'booktitle', u'publisher', u'school', u'issn',
  285.                                 u'volume', u'number', u'pages', u'year',
  286.                                 u'doi', u'totalcites' ,u'citeshist'],
  287.                                 []).encode('latin_1')
  288.  
  289. #    print generateBiBTeX(bibs).encode('latin_1')

Raw Paste

Login or Register to edit or fork this paste. It's free.