#! /usr/bin/env python
## -*- coding: utf-8 -*-
"""
Created on Tue Sep 29 23:25:58 2015
@author: siome
"""
## http://stackoverflow.com/questions/13200709/extract-google-scholar-results-using-python-or-r
"""
I suggest you not use libraries written for crawling specific websites, but
rather general-purpose HTML libraries that are well tested and have
well-formed documentation, such as BeautifulSoup.

To access websites with a browser identity, you can use a URL opener class
with a custom user agent:

    from urllib import FancyURLopener
    class MyOpener(FancyURLopener):
        version = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.152 Safari/537.36'
    openurl = MyOpener().open

Then download the required URL as follows:

    openurl(url).read()

For retrieving scholar results just use the
http://scholar.google.se/scholar?hl=en&q=${query} URL.

To extract pieces of information from a retrieved HTML file, you can use this
piece of code:

    from bs4 import SoupStrainer, BeautifulSoup
    page = BeautifulSoup(openurl(url).read(), parse_only=SoupStrainer('div', id='gs_ab_md'))

This piece of code extracts a concrete div element that contains the number of
results shown on a Google Scholar search-results page.
"""
"""
s is the page, for example (s = f.read())

    gb = BeautifulSoup(s, parse_only=SoupStrainer('div', id='gsc_graph_bars'))

or:

    soup = BeautifulSoup(html_doc, 'html.parser')
    gb = soup.find('div', id = 'gsc_graph_bars')
    for link in gb.find_all('a'):
        year = int(link.get('href')[-4:])
        cites = int(link.span.contents[0])
        print year, cites

total citations string = <div class="gsc_value">
Cited by xxx

    import re
    m = re.search('Cited by ([0-9]*)', s)
    int(m.group(1))
"""
# -- standard library --------------------------------------------------------
import codecs
import copy
import random
import re
import sys
from time import sleep
#import logging
#logging.basicConfig()

# -- third party -------------------------------------------------------------
import bibtexparser
import bibtexparser.customization
from bibtexparser.bparser import BibTexParser
#from zope.publisher.browser import TestRequest
#from bibliograph.parsing.parsers.bibtex import BibtexParser
#from bs4 import BeautifulSoup
#from urllib import FancyURLopener
#class MyOpener(FancyURLopener):
#    version = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.152 Safari/537.36'
#openurl = MyOpener().open
def parseBibtex( bibtex_fname ):
    """Read a BibTeX file and return its entries.

    Parameters
    ----------
    bibtex_fname : str
        Path of the BibTeX file to read.

    Returns
    -------
    list of dict
        One dictionary per entry, as produced by bibtexparser
        (keys include u'ID' and u'ENTRYTYPE' plus the bibtex fields).
    """
    parser = BibTexParser()
    # keep entries whose type is not one of the standard BibTeX types
    parser.ignore_nonstandard_types = False
    with open(bibtex_fname) as fh:
        database = parser.parse(fh.read())
    return database.entries
def generateCSV(refs,
                valid_fields=(u'title', u'editor', u'author', u'journal',
                              u'booktitle', u'publisher', u'school', u'issn',
                              u'volume', u'number', u'pages', u'year', u'doi',
                              u'totalcites', u'citeshist', u'gscholar'),
                omit_fields=()):
    """Build a CSV document (one row per bibtex entry) as a unicode string.

    Parameters
    ----------
    refs : list of dict
        Parsed bibtex entries (see parseBibtex).
    valid_fields : sequence of unicode
        Fields emitted as CSV columns, in order, after 'label' and 'type'.
    omit_fields : sequence of unicode
        Fields to leave out of any embedded bibtex rendering (passed
        through to toCSV).

    Returns
    -------
    unicode
        Header line plus one line per entry, newline-terminated.

    Notes
    -----
    The original code encoded each header name with latin_1 while building
    a text string, which breaks under Python 3 (str + bytes) and crashes
    on non-latin1 field names; header names are ASCII, so the encode was
    dropped.  Defaults are tuples to avoid the mutable-default-argument
    pitfall.
    """
    header = u'label,type'
    for field in valid_fields:
        header += u',' + field.lower()
    # collect rows and join once instead of quadratic string concatenation
    lines = [header]
    for bib in refs:
        lines.append(toCSV(bib, valid_fields, omit_fields))
    return u'\n'.join(lines) + u'\n'
def str2CSVHTML (s):
    """Normalize a field value so it can sit inside one CSV cell.

    Collapses all runs of whitespace (including tabs and newlines) to a
    single space, drops percent signs, and replaces characters that would
    break the CSV encoding (semicolon, double quote, comma) or that are
    TeX dashes (---, --) with HTML character entities.

    Parameters
    ----------
    s : unicode
        Raw field value from a bibtex entry.

    Returns
    -------
    unicode
        One-line, CSV-safe, HTML-escaped value.

    Notes
    -----
    The pasted original had the entity strings decoded back into the
    characters themselves (e.g. replace(u';', u';')), turning every
    replacement into a no-op; the entities are restored here per the
    surrounding comments.
    """
    # removes duplicated, trailing and preceding spaces, also tabs/newlines
    nv = u' '.join(s.split())
    # percent: drop TeX-escaped \% first, then any bare %
    nv = nv.replace(u'\\%', u'')
    nv = nv.replace(u'%', u'')
    # semicolon
    nv = nv.replace(u';', u'&#59;')
    # quotation mark
    nv = nv.replace(u'"', u'&quot;')
    # comma -- unescaped it would split the CSV cell
    nv = nv.replace(u',', u'&#44;')
    # emdash then endash (--- must be handled before --)
    nv = nv.replace(u'---', u'&mdash;')
    nv = nv.replace(u'--', u'&ndash;')
    return nv
def toCSV (ref,
           valid_fields=(u'title', u'editor', u'author', u'journal',
                         u'booktitle', u'publisher', u'school', u'issn',
                         u'volume', u'number', u'pages', u'year', u'doi',
                         u'totalcites', u'citeshist', u'gscholar'),
           omit_fields=()):
    """Render one bibtex entry as a single CSV line: label,type,field,...

    Parameters
    ----------
    ref : dict
        One parsed bibtex entry; must contain u'ID' and u'ENTRYTYPE'.
        NOTE: if u'bibtex' is requested and absent, this function caches
        a rendered bibtex-HTML string into ref[u'bibtex'] (mutates ref).
    valid_fields : sequence of unicode
        Fields emitted, in order, after the label and type columns.
    omit_fields : sequence of unicode
        Fields excluded from the embedded bibtex rendering.

    Returns
    -------
    unicode
        CSV line without a trailing newline.
    """
    fields = [each.lower() for each in valid_fields]
    # lazily build and cache an HTML bibtex rendering when requested
    if u'bibtex' in fields and u'bibtex' not in ref:
        bt = toBibtexHTML(ref,
                          [u'title', u'editor', u'author',
                           u'journal', u'booktitle', u'publisher',
                           u'school', u'issn', u'volume', u'number',
                           u'pages', u'year'],
                          omit_fields)
        ref[u'bibtex'] = u' '.join(bt.split())
    bib_key = ref[u'ID']
    ref_type = ref[u'ENTRYTYPE'].replace(u'Reference', u'')
    s = bib_key.strip() + u',' + ref_type.strip()
    for k in fields:
        # this guarantees that even if this bibtex entry does not have
        # the field, it will generate an 'empty' column.  somehow
        # ruby/jekyll does not like a fully empty one, so a non-existing
        # field is just a ' ' white space.
        s += u','
        if k in ref:
            # some fields arrive as lists; use the first element
            nv = ref[k][0] if isinstance(ref[k], list) else ref[k]
            if k == u'keywords':
                nv = nv.replace(u',', u'')
            elif k == u'author':
                nv = u' '.join(nv.split())
            elif k != u'bibtex':
                nv = str2CSVHTML(nv)
            s += nv
        else:
            # white space placeholder for the non-existing field
            s += u' '
    return s
def toBibtex (ref,
              valid_fields=(u'title', u'editor', u'author', u'journal', u'booktitle',
                            u'publisher', u'school', u'issn', u'volume', u'number',
                            u'pages', u'year', u'doi', u'pdf'),
              omit_fields=()):
    """Render a parsed bibtex entry back into a plain BibTeX record.

    Parameters
    ----------
    ref : dict
        One parsed bibtex entry; must contain u'ID' and u'ENTRYTYPE'.
    valid_fields : sequence of unicode
        Fields emitted, in this order, when present in ref.
    omit_fields : sequence of unicode
        Fields skipped even if present (compared case-insensitively).

    Returns
    -------
    unicode
        A record of the form "\\n@type{key,\\n field = {value},\\n ...}\\n".
    """
    omit = [each.lower() for each in omit_fields]
    fields = [each.lower() for each in valid_fields]
    bib_key = ref[u'ID']
    ref_type = ref[u'ENTRYTYPE'].replace(u'Reference', u'')
    parts = [u'\n@' + ref_type + u'{' + bib_key + u',\n']
    for k in fields:
        if k in ref and k not in omit:
            # some fields arrive as lists; use the first element
            nv = ref[k][0] if isinstance(ref[k], list) else ref[k]
            parts.append(u' ' + k + u' = {' + nv + u'},\n')
    bibtex = u''.join(parts)
    # drop the trailing ",\n" and close the record
    return bibtex[0:-2] + u'}\n'
def toBibtexHTML (ref,
                  valid_fields=(u'title', u'editor', u'author', u'journal', u'booktitle',
                                u'publisher', u'school', u'issn', u'volume', u'number',
                                u'pages', u'year', u'doi', u'pdf'),
                  omit_fields=()):
    """Render a parsed bibtex entry as an HTML fragment with <br /> breaks.

    Same record layout as toBibtex, but line breaks are "<br />" instead
    of "\\n" and there is no trailing comma handling (each field carries
    its own leading ",<br />").

    Parameters
    ----------
    ref : dict
        One parsed bibtex entry; must contain u'ID' and u'ENTRYTYPE'.
    valid_fields : sequence of unicode
        Fields emitted, in this order, when present in ref.
    omit_fields : sequence of unicode
        Fields skipped even if present (compared case-insensitively).

    Returns
    -------
    unicode
        "<br />@type{key,<br /> field = {value}...}<br />".
        NOTE(review): the single space of indentation before each field
        is reconstructed from a mangled paste -- the original may have
        used &nbsp; entities; confirm against rendered output.
    """
    omit = [each.lower() for each in omit_fields]
    fields = [each.lower() for each in valid_fields]
    bib_key = ref[u'ID']
    ref_type = ref[u'ENTRYTYPE'].replace(u'Reference', u'')
    parts = [u'<br />@' + ref_type + u'{' + bib_key]
    for k in fields:
        if k in ref and k not in omit:
            # some fields arrive as lists; use the first element
            nv = ref[k][0] if isinstance(ref[k], list) else ref[k]
            parts.append(u',<br /> ' + k + u' = {' + nv + u'}')
    # close the record; the last field's brace plus this one yields "}}"
    return u''.join(parts) + u'}<br />'
if __name__ == "__main__":
    # usage: script.py input.bib output.csv
    if len(sys.argv) != 3:
        # single-argument print(...) calls work under both Python 2 and 3;
        # the original used Python-2-only print statements
        print('\n typical usage:')
        print(' "%s" input output\n' % sys.argv[0])
        print(' input - input bibtex file')
        print(' output - output bibtex file')
        sys.exit()
    fname = sys.argv[1]
    fnout = sys.argv[2]
    bibs = parseBibtex(fname)
    so = generateCSV(bibs, [u'title', u'editor', u'author', u'journal',
                            u'booktitle', u'publisher', u'school', u'issn',
                            u'volume', u'number', u'pages', u'year',
                            u'doi', u'totalcites', u'citeshist',
                            u'gscholar', u'keywords', u'pdf', u'abstract',
                            u'bibtex'],
                     [])
    # codecs.open encodes transparently on Python 2 and 3; writing
    # manually-encoded bytes to a text-mode file breaks under Python 3
    with codecs.open(fnout, 'w', encoding='utf-8') as fo:
        fo.write(so)
    # print generateBiBTeX(bibs).encode('latin_1')
# (paste-site residue removed: "Raw Paste")