#!/usr/bin/env python
# -*- coding: utf-8 -*-
- """
- Created on Tue Sep 29 23:25:58 2015
- @author: siome
- """
- ## http://stackoverflow.com/questions/13200709/extract-google-scholar-results-using-python-or-r
- """
- I suggest you not to use specific libraries for crawling specific websites, but to use general purpose HTML libraries that are well tested and has well formed documentation such as BeautifulSoup.
- For accessing websites with a browser information, you could use an url opener class with a custom user agent:
- from urllib import FancyURLopener
- class MyOpener(FancyURLopener):
- version = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.152 Safari/537.36'
- openurl = MyOpener().open
- And then download the required url as follows:
- openurl(url).read()
- For retrieving scholar results just use http://scholar.google.se/scholar?hl=en&q=${query} url.
- To extract pieces of information from a retrieved HTML file, you could use this piece of code:
- from bs4 import SoupStrainer, BeautifulSoup
- page = BeautifulSoup(openurl(url).read(), parse_only=SoupStrainer('div', id='gs_ab_md'))
- This piece of code extracts a concrete div element that contains number of results shown in a Google Scholar search results page.
- """
- """
- s is the page, for example (s = f.read())
- gb = BeautifulSoup(s, parse_only=SoupStrainer('div', id='gsc_graph_bars'))
- or:
- soup = BeautifulSoup(html_doc, 'html.parser')
- gb = soup.find('div', id = 'gsc_graph_bars')
- for link in gb.find_all('a'):
- year = int(link.get('href')[-4:])
- cites = int(link.span.contents[0])
- print year, cites
- total citations string = <div class="gsc_value">
- Cited by xxx
- import re
- m = re.search('Cited by ([0-9]*)', s)
- int(m.group(1))
- """
import codecs
import copy
import random
import re
import sys
from time import sleep

from bibliograph.parsing.parsers.bibtex import BibtexParser
from bs4 import BeautifulSoup
from urllib import FancyURLopener
class MyOpener(FancyURLopener):
    """URL opener with a browser-like user agent string."""
    version = ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) '
               'AppleWebKit/537.36 (KHTML, like Gecko) '
               'Chrome/33.0.1750.152 Safari/537.36')

openurl = MyOpener().open
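
# A minimal sketch of the recipe quoted in the first docstring: fetch a
# Scholar results page and pull out the div with the result count.  The
# query URL and the 'gs_ab_md' id come from that Stack Overflow answer;
# treat both as assumptions, since Google changes its markup over time.
def scholarResultCount(query):
    from bs4 import SoupStrainer  # only BeautifulSoup is imported above
    url = 'http://scholar.google.se/scholar?hl=en&q=%s' % query
    page = BeautifulSoup(openurl(url).read(), 'html.parser',
                         parse_only=SoupStrainer('div', id='gs_ab_md'))
    div = page.find('div', id='gs_ab_md')
    # the div's text looks like 'About 12,300 results (0.05 sec)'
    return div.get_text() if div is not None else None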
# Research areas: key -> (display name, icon path).
areas = { 'cv':        ('Computer Vision',         'Images/Vision-100.png'),
          'ai':        ('Artificial Intelligence', 'Images/AI-100.png'),
          'forensics': ('Forensics',               'Images/Forensics-80.png'),
          'cg':        ('Computer Graphics',       'Images/Graphics-80.png'),
          'asl':       ('Sign Language',           'Images/ASL-100.png'),
          'wavelets':  ('Wavelets',                'Images/Wavelet-100.png'),
          'sport':     ('Sport',                   'Images/Sport-80.png'),
          'uncert':    ('Uncertainties',           'Images/Uncert-100.png'),
          'virtcrd':   ('Virtual Crowds',          'Images/Crowd-80.png'),
          'med':       ('Biomedical',              'Images/Med-64.png'),
          'biblio':    ('Bibliometrics',           'Images/Book-80.png')
          }

areas_keys = areas.keys()
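
# The table above is not used elsewhere in this script; a consumer would
# presumably look entries up along these lines (hypothetical usage):
#     name, icon = areas['cv']   # ('Computer Vision', 'Images/Vision-100.png')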
def parseBibtex(source):
    """Parse a BibTeX source string into a list of entry dictionaries."""
    parser = BibtexParser()
    # fix encoding
    # source = parser.checkEncoding(source)
    # fix LaTeX special characters, among other things
    source = parser.preprocess(source)
    # get the list of entry dictionaries
    return parser.getEntries(source)
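
# Example (hedged; key names follow bibliograph.parsing's conventions):
#     entries = parseBibtex(open('refs.bib').read())
#     entries[0]['pid'], entries[0]['reference_type'], entries[0]['title']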
def grabCitations(ref):
    """Return a copy of ref augmented with citation data from Google Scholar."""
    nref = copy.deepcopy(ref)
    nref[u'totalcites'] = '0'
    nref[u'citeshist'] = ''
    # only entries with a 'gscholar' id have a Scholar page to scrape
    if 'gscholar' in nref:
        surl = ("https://scholar.google.com.br/"
                "citations?view_op=view_citation&hl=en&"
                "citation_for_view=%s") % nref['gscholar']
        nref['gscholarpage'] = openurl(surl).read()
        nref['soup'] = BeautifulSoup(nref['gscholarpage'], 'html.parser')
        # the per-year citation histogram lives in div#gsc_graph_bars
        gb = nref['soup'].find('div', id='gsc_graph_bars')
        cithist = ""
        if gb is not None:
            for link in gb.find_all('a'):
                # each bar links to ...&as_ylo=YYYY; its span holds the count
                year = int(link.get('href')[-4:])
                cites = int(link.span.contents[0])
                cithist = cithist + "%d,%d;" % (year, cites)
            cithist = cithist[:-1]  # drop the trailing ';'
        m = re.search('Cited by ([0-9]*)', nref['gscholarpage'])
        if m is not None:
            nref[u'totalcites'] = m.group(1)
        nref[u'citeshist'] = cithist
    return nref
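
# A small helper sketch for consuming the 'citeshist' string built above,
# assuming the "year,cites;year,cites" format produced by grabCitations:
def parseCitesHist(cithist):
    """Turn '2013,5;2014,8' into [(2013, 5), (2014, 8)]."""
    if not cithist:
        return []
    return [tuple(int(x) for x in pair.split(','))
            for pair in cithist.split(';')]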
def grabAllCitations(bibs):
    """Scrape citation data for every entry in bibs."""
    # A list comprehension would do:
    #     nbibs = [grabCitations(b) for b in bibs]
    # but the explicit loop lets us sleep a random interval between
    # requests so as not to hammer Google Scholar.
    nbibs = []
    i = 1
    for b in bibs:
        if i > 1:
            sleep(random.uniform(1., 6.))
        nbibs.append(grabCitations(b))
        i = i + 1
    return nbibs
def generateBiBTeX(refs,
                   valid_fields=[u'title', u'editor', u'author', u'journal',
                                 u'booktitle', u'publisher', u'school', u'issn',
                                 u'volume', u'number', u'pages', u'year', u'doi',
                                 u'totalcites', u'citeshist', u'gscholar'],
                   omit_fields=[]):
    """Render a list of entry dictionaries as one BibTeX string."""
    s = ""
    for bib in refs:
        s = s + toBiBTeX(bib, valid_fields, omit_fields) + "\n"
    return s
def toBiBTeX(ref,
             valid_fields=[u'title', u'editor', u'author', u'journal',
                           u'booktitle', u'publisher', u'school', u'issn',
                           u'volume', u'number', u'pages', u'year', u'doi',
                           u'pdf'],
             omit_fields=[]):
    """Render a single entry dictionary as a BibTeX record."""
    omit = [each.lower() for each in omit_fields]
    fields = [each.lower() for each in valid_fields]
    # translate characters that are special in (La)TeX
    ttable = [('&', r'\&'),
              ('~', r'\~'),
              ('—', r'---'),
              ('–', r'--'),
              ]
    bib_key = ref['pid']
    # e.g. 'ArticleReference' -> 'Article'
    ref_type = ref['reference_type'].replace('Reference', '')
    bibtex = u'\n@' + ref_type + u'{' + bib_key + u',\n'
    ref_keys = ref.keys()
    for k in fields:
        if k in ref_keys and k not in omit:
            # some fields are parsed as lists; keep the first item
            if type(ref[k]) == list:
                nv = ref[k][0]
            else:
                nv = ref[k]
            for a, b in ttable:
                nv = nv.replace(a, b)
            # the .bib file is read as latin-1, so decode values accordingly
            bibtex = bibtex + u'  ' + k + u' = {'
            bibtex = bibtex + unicode(nv, encoding='latin_1')
            bibtex = bibtex + u'},\n'
    # remove the trailing comma and close the record
    bibtex = bibtex[0:-2] + u"}\n"
    return bibtex
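
# Example (hypothetical entry, with the field names used above):
#     ref = {'pid': 'Silva2015', 'reference_type': 'ArticleReference',
#            'title': 'A Title', 'year': '2015'}
#     print toBiBTeX(ref)
# prints something like:
#     @Article{Silva2015,
#       title = {A Title},
#       year = {2015}}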
if __name__ == "__main__":
    if len(sys.argv) != 2:
        print '\n' + sys.argv[0] + ': requires the name of a bibtex file!\n'
        sys.exit()
    fname = sys.argv[1]
    bibfile = codecs.open(fname, encoding='latin_1').read()
    bibs = parseBibtex(bibfile)
    nbibs = grabAllCitations(bibs)
    print generateBiBTeX(nbibs, [u'title', u'editor', u'author', u'journal',
                                 u'booktitle', u'publisher', u'school', u'issn',
                                 u'volume', u'number', u'pages', u'year',
                                 u'doi', u'totalcites', u'citeshist'],
                         []).encode('latin_1')
    # to emit the entries without the scraped citation fields:
    # print generateBiBTeX(bibs).encode('latin_1')
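
# Usage (hedged; the file names are examples):
#     python this_script.py pubs.bib > pubs_with_cites.bib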