PYTHON   45
bib2csv py
Guest on 23rd August 2022 10:49:04 AM


  1. #! /usr/bin/env python
  2. ## -*- coding: utf-8 -*-
  3. """
  4. Created on Tue Sep 29 23:25:58
  5.  
  6. @author: siome
  7. """
  8.  
  9. ## http://stackoverflow.com/questions/13200709/extract-google-scholar-results-using-python-or-r
  10.  
  11. """
  12.  
  13. I suggest you not use specific libraries for crawling specific websites, but instead use general-purpose HTML libraries that are well tested and have well-formed documentation, such as BeautifulSoup.
  14.  
  15. To access websites while presenting browser information, you could use a URL opener class with a custom user agent:
  16.  
  17. from urllib import FancyURLopener
  18. class MyOpener(FancyURLopener):
  19.    version = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.152 Safari/537.36'
  20. openurl = MyOpener().open
  21.  
  22. And then download the required url as follows:
  23.  
  24. openurl(url).read()
  25.  
  26. For retrieving scholar results just use http://scholar.google.se/scholar?hl=en&q=${query} url.
  27.  
  28. To extract pieces of information from a retrieved HTML file, you could use this piece of code:
  29.  
  30. from bs4 import SoupStrainer, BeautifulSoup
  31. page = BeautifulSoup(openurl(url).read(), parse_only=SoupStrainer('div', id='gs_ab_md'))
  32.  
  33. This piece of code extracts the specific div element that contains the number of results shown on a Google Scholar search results page.
  34. """
  35.  
  36. """
  37.  
  38. s is the page, for example (s = f.read())
  39.  
  40. gb = BeautifulSoup(s, parse_only=SoupStrainer('div', id='gsc_graph_bars'))
  41.  
  42. or:
  43. soup = BeautifulSoup(html_doc, 'html.parser')
  44. gb = soup.find('div', id = 'gsc_graph_bars')
  45.  
  46.  
  47. for link in gb.find_all('a'):
  48.    year = int(link.get('href')[-4:])
  49.    cites = int(link.span.contents[0])
  50.    print year, cites
  51.  
  52.  
  53.  
  54. total citations string = <div class="gsc_value">
  55. Cited by xxx
  56.  
  57. import re
  58. m = re.search('Cited by ([0-9]*)', s)
  59. int(m.group(1))
  60.  
  61.  
  62.  
  63. """
  64.  
  65.  
  66. import copy
  67. import sys
  68. import codecs
  69. #import logging
  70. #logging.basicConfig()
  71.  
  72.  
  73. import bibtexparser
  74. from   bibtexparser.bparser import BibTexParser
  75. import bibtexparser.customization
  76.  
  77. #from zope.publisher.browser import TestRequest
  78.  
  79. import re
  80. import random
  81. from time import sleep
  82.  
  83.  
  84. #from bibliograph.parsing.parsers.bibtex import BibtexParser
  85. #from bs4 import BeautifulSoup
  86.  
  87. #from urllib import FancyURLopener
  88. #class MyOpener(FancyURLopener):
  89. #    version = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.152 Safari/537.36'
  90. #openurl = MyOpener().open
  91.  
  92.  
  93. def parseBibtex( bibtex_fname ):
  94.   bp = BibTexParser()
  95.   bp.ignore_nonstandard_types = False
  96.  
  97.   with open(bibtex_fname) as bibtex_file:
  98.     bibtex_database = bp.parse(bibtex_file.read())
  99.  
  100.   # get list of dictionaries
  101.   return bibtex_database.entries
  102.  
  103.  
  104.  
  105.  
  106. def generateCSV(refs,
  107.             valid_fields=[u'title', u'editor', u'author', u'journal',
  108.                           u'booktitle', u'publisher', u'school', u'issn',
  109.                           u'volume', u'number', u'pages', u'year', u'doi',
  110.                           u'totalcites' ,u'citeshist', u'gscholar'],
  111.               omit_fields=[]):
  112.  
  113.     s = 'label,type'
  114.     for l in  [each.lower().encode('latin_1') for each in valid_fields]:
  115.         s = s + ',' + l
  116.     s = s + '\n'
  117.  
  118.  
  119.     ct = 1
  120.     for bib in refs:
  121.         s2 =    toCSV(bib,valid_fields,omit_fields)
  122.         s  =    s + s2 + '\n'
  123.         ct = ct+1
  124.  
  125.     return s
  126.  
  127.  
  128. def str2CSVHTML (s):
  129.     # removes duplicated and trailing and preceeding spaces,
  130.     # also tabs and newlines
  131.     nv = u' '.join(s.split())
  132.     # changes commas, quotes, etc for html codes to avoid
  133.     # problems with csv encodings
  134.  
  135.     # percent
  136.     nv = nv.replace(u'\%', u'&#25;')
  137.     nv = nv.replace(u'%', u'&#25;')
  138.     # semi colon
  139.     nv = nv.replace(u';', u'&#59;')
  140.     # quotation mark
  141.     nv = nv.replace(u'"', u'&quot;')
  142.     # comma
  143.     nv = nv.replace(u',', u'&#44;')
  144.     # emdash and endash
  145.     nv = nv.replace(u'---', u'&mdash;')
  146.     nv = nv.replace(u'--', u'&ndash;')
  147.  
  148.     return nv
  149.  
  150.  
  151. def toCSV (ref,
  152.             valid_fields=[u'title', u'editor', u'author', u'journal',
  153.             u'booktitle', u'publisher', u'school', u'issn',
  154.             u'volume', u'number', u'pages', u'year', u'doi',
  155.             u'totalcites' ,u'citeshist', u'gscholar'],
  156.             omit_fields=[]):
  157.  
  158.     fields = [each.lower() for each in valid_fields]
  159.  
  160.     if u'bibtex' in fields and u'bibtex' not in ref.keys():
  161.         bt = toBibtexHTML(ref,
  162.                             [ u'title', u'editor', u'author',
  163.                               u'journal', u'booktitle', u'publisher',
  164.                               u'school', u'issn', u'volume', u'number',
  165.                               u'pages', u'year'],
  166.                             omit_fields)
  167.         ref[u'bibtex'] = u' '.join(bt.split())
  168.  
  169.  
  170.     bib_key = ref[u'ID']
  171.     ref_type = ref[u'ENTRYTYPE'].replace(u'Reference','')
  172.  
  173.  
  174.     s = bib_key.strip() + u',' + ref_type.strip()
  175.  
  176.     ref_keys = ref.keys()
  177.  
  178.     for k in fields:
  179.  
  180.         # this guarantees that even if this bibtex entry does not
  181.         # have the field, it will generate an 'empty' entry.
  182.         # somehow ruby/jekyll does not like an fully empty,
  183.         # so a non-existing field is just a ' ' white space.
  184.         s = s + u','
  185.  
  186.         if k in ref_keys:
  187.             if type(ref[k]) == list:
  188.                 nv = ref[k][0]
  189.             else:
  190.                 nv = ref[k]
  191.  
  192.             if k == u'keywords':
  193.                 nv = nv.replace(u',', '')
  194.             elif k == u'author':
  195.                 nv = u' '.join(nv.split())
  196.             elif k != u'bibtex':
  197.                 nv = str2CSVHTML(nv)
  198.  
  199.  
  200.             s = s + nv
  201.         else:
  202.             # this is the white space of the non-existing field
  203.             s = s + u' '
  204.  
  205.     return s
  206.  
  207. def toBibtex (ref,
  208.             valid_fields=[u'title', u'editor', u'author', u'journal', u'booktitle',
  209.                           u'publisher', u'school', u'issn', u'volume', u'number',
  210.                           u'pages', u'year', u'doi', u'pdf'],
  211.               omit_fields=[]):
  212.  
  213.     omit   = [each.lower() for each in omit_fields]
  214.     fields = [each.lower() for each in valid_fields]
  215.  
  216.     bib_key = ref[u'ID']
  217.     ref_type = ref[u'ENTRYTYPE'].replace(u'Reference','')
  218.     bibtex = u'\n@' + ref_type + u'{' + bib_key + u',\n' # '%s{%s,\n" %  (ref_type, bib_key)
  219.  
  220.     ref_keys = ref.keys()
  221.  
  222.     for k in fields:
  223.         if k in ref_keys and k not in omit:
  224.             if type(ref[k]) == list:
  225.                 nv = ref[k][0]
  226.             else:
  227.                 nv = ref[k]
  228.  
  229.             bibtex = bibtex + u'  '
  230.             bibtex = bibtex + k
  231.             bibtex = bibtex + u' = {'
  232.             bibtex = bibtex + nv
  233.             bibtex = bibtex + u'},\n'
  234.  
  235.     # remove trailing command
  236.     bibtex = bibtex[0:-2] + u"}\n"
  237.  
  238.     return bibtex
  239.  
  240. def toBibtexHTML (ref,
  241.             valid_fields=[u'title', u'editor', u'author', u'journal', u'booktitle',
  242.                           u'publisher', u'school', u'issn', u'volume', u'number',
  243.                           u'pages', u'year', u'doi', u'pdf'],
  244.               omit_fields=[]):
  245.  
  246.     omit   = [each.lower() for each in omit_fields]
  247.     fields = [each.lower() for each in valid_fields]
  248.  
  249.     bib_key = ref[u'ID']
  250.     ref_type = ref[u'ENTRYTYPE'].replace(u'Reference','')
  251.     bibtex = u'<br />@' + ref_type + u'{' + bib_key
  252.  
  253.     ref_keys = ref.keys()
  254.  
  255.     for k in fields:
  256.         if k in ref_keys and k not in omit:
  257.             if type(ref[k]) == list:
  258.                 nv = ref[k][0]
  259.             else:
  260.                 nv = ref[k]
  261.  
  262.             bibtex = bibtex + u'&#44;<br />'
  263.             bibtex = bibtex + u'&nbsp;&nbsp;&nbsp;&nbsp;'
  264.             bibtex = bibtex + k
  265.             bibtex = bibtex + u' = {'
  266.             bibtex = bibtex + nv
  267.             bibtex = bibtex + u'}'
  268.  
  269.     # remove trailing command
  270.     bibtex = bibtex + u"}<br />"
  271.  
  272.     return bibtex
  273.  
  274.  
  275.  
  276. if __name__ == "__main__":
  277.     if len(sys.argv) != 3:
  278.         print '\n typical usage:'
  279.         print '  "%s" input output\n' % sys.argv[0]
  280.         print '     input  - input bibtex file'
  281.         print '     output - output bibtex file'
  282.         sys.exit()
  283.     fname = sys.argv[1]
  284.     fnout  = sys.argv[2]
  285.     bibs = parseBibtex( fname )
  286.  
  287.  
  288.     so =  generateCSV( bibs,   [u'title', u'editor', u'author', u'journal',
  289.                                 u'booktitle', u'publisher', u'school', u'issn',
  290.                                 u'volume', u'number', u'pages', u'year',
  291.                                 u'doi', u'totalcites' ,u'citeshist',
  292.                                 u'gscholar', u'keywords', u'pdf', u'abstract',
  293.                                 u'bibtex'],
  294.                                 [])
  295.  
  296.  
  297.  
  298.     with open(fnout,'w') as fo:
  299.         fo.write(so.encode('utf-8'))
  300.  
  301. #    print generateBiBTeX(bibs).encode('latin_1')

Raw Paste

Login or Register to edit or fork this paste. It's free.