#! /usr/bin/env python
# -*- coding: utf-8 -*-
"""
Created on Tue Sep 29 23:25:58

@author: siome
"""
## http://stackoverflow.com/questions/13200709/extract-google-scholar-results-using-python-or-r
"""
I suggest that you not use libraries specific to particular websites for
crawling, but general-purpose HTML libraries that are well tested and
have well-formed documentation, such as BeautifulSoup.

For accessing websites with browser information, you could use a URL
opener class with a custom user agent:

    from urllib import FancyURLopener

    class MyOpener(FancyURLopener):
        version = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.152 Safari/537.36'

    openurl = MyOpener().open

And then download the required URL as follows:

    openurl(url).read()

For retrieving scholar results just use the
http://scholar.google.se/scholar?hl=en&q=${query} URL.

To extract pieces of information from a retrieved HTML file, you could
use this piece of code:

    from bs4 import SoupStrainer, BeautifulSoup
    page = BeautifulSoup(openurl(url).read(),
                         parse_only=SoupStrainer('div', id='gs_ab_md'))

This piece of code extracts a concrete div element that contains the
number of results shown on a Google Scholar search results page.
(A small sketch of this recipe appears right after the imports below.)
"""
- """
- s is the page, for example (s = f.read())
- gb = BeautifulSoup(s, parse_only=SoupStrainer('div', id='gsc_graph_bars'))
- or:
- soup = BeautifulSoup(html_doc, 'html.parser')
- gb = soup.find('div', id = 'gsc_graph_bars')
- for link in gb.find_all('a'):
- year = int(link.get('href')[-4:])
- cites = int(link.span.contents[0])
- print year, cites
- total citations string = <div class="gsc_value">
- Cited by xxx
- import re
- m = re.search('Cited by ([0-9]*)', s)
- int(m.group(1))
- """
import copy
import sys
import codecs
#import logging
#logging.basicConfig()
import bibtexparser
from bibtexparser.bparser import BibTexParser
import bibtexparser.customization
#from zope.publisher.browser import TestRequest
import re
import random
from time import sleep
from bibliograph.parsing.parsers.bibtex import BibtexParser
from bs4 import BeautifulSoup
from urllib import FancyURLopener


## URL opener with a browser user-agent string, so that Google Scholar
## serves the regular HTML pages (see the Stack Overflow quote above).
class MyOpener(FancyURLopener):
    version = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.152 Safari/537.36'

openurl = MyOpener().open
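
## A minimal sketch of the search-page recipe quoted at the top of this
## file. The function name is ours and the query is a placeholder; it
## returns the div that holds the result count of a Scholar search page.
def searchResultCount(query):
    from bs4 import SoupStrainer
    from urllib import quote_plus
    url = 'http://scholar.google.se/scholar?hl=en&q=%s' % quote_plus(query)
    return BeautifulSoup(openurl(url).read(),
                         parse_only=SoupStrainer('div', id='gs_ab_md'))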
areas = {u'cv':        (u'Computer Vision', u'Images/Vision-100.png'),
         u'ai':        (u'Artificial Intelligence', u'Images/AI-100.png'),
         u'ml':        (u'Machine Learning', u'Images/ML-80.png'),
         u'forensics': (u'Forensics', u'Images/Forensics-80.png'),
         u'access':    (u'Accessibility', u'Images/Access-80.png'),
         u'cg':        (u'Computer Graphics', u'Images/Graphics-80.png'),
         u'asl':       (u'Sign Language', u'Images/ASL-100.png'),
         u'wavelets':  (u'Wavelets', u'Images/Wavelet-100.png'),
         u'sport':     (u'Sport', u'Images/Sport-80.png'),
         u'uncert':    (u'Uncertainties', u'Images/Uncert-100.png'),
         u'virtcrd':   (u'Virtual Crowds', u'Images/Crowd-80.png'),
         u'med':       (u'Biomedical', u'Images/Med-64.png'),
         u'biblio':    (u'Bibliometrics', u'Images/Book-80.png')
         }

areas_keys = areas.keys()
def parseBibtex(bibtex_fname):
    """Parse a BibTeX file and return its entries as a list of
    dictionaries (bibtexparser adds the keys 'ID' and 'ENTRYTYPE'
    to each entry)."""
    with open(bibtex_fname) as bibtex_file:
        bibtex_database = bibtexparser.load(bibtex_file)
    return bibtex_database.entries
def grabCitations(ref):
    """Return a copy of the entry ref with u'totalcites' (citation count,
    as a string) and u'citeshist' (a "year,count;year,count;..." string)
    filled in from the entry's Google Scholar citation page, if the entry
    has a u'gscholar' id."""
    nref = copy.deepcopy(ref)
    nref[u'totalcites'] = '0'
    nref[u'citeshist'] = ''
    # test first if there is a gscholar page
    if 'gscholar' in nref:
        surl = ("https://scholar.google.com.br/"
                "citations?view_op=view_citation&hl=en&"
                "citation_for_view=%s") % nref['gscholar']
        nref[u'gscholarpage'] = openurl(surl).read()
        nref[u'soup'] = BeautifulSoup(nref['gscholarpage'], 'html.parser')
        gb = nref[u'soup'].find('div', id='gsc_graph_bars')
        cithist = ""
        if gb is not None:
            for link in gb.find_all('a'):
                year = int(link.get('href')[-4:])
                cites = int(link.span.contents[0])
                cithist = cithist + "%d,%d;" % (year, cites)
            cithist = cithist[:-1]  # drop the trailing ';'
        m = re.search('Cited by ([0-9]*)', nref['gscholarpage'])
        if m is not None:  # guard against pages with no "Cited by" string
            nref[u'totalcites'] = m.group(1)
        nref[u'citeshist'] = cithist
    return nref
def grabAllCitations(bibs):
    nbibs = []
    i = 1
    for b in bibs:
        if i > 1:
            # random pause between requests, to go easy on Google Scholar
            sleep(random.uniform(1., 6.))
        # print "Begining %s -> %d of %d" % (b['pid'], i, n)
        # print "  antes type(b) = " + str(type(b))
        # for k,v in b.iteritems():
        #     print "    %s, %s" % (str(k), str(v))
        b2 = grabCitations(b)
        nbibs.append(b2)
        i = i + 1
    return nbibs
def generateBiBTeX(refs,
                   valid_fields=[u'title', u'editor', u'author', u'journal',
                                 u'booktitle', u'publisher', u'school', u'issn',
                                 u'volume', u'number', u'pages', u'year',
                                 u'doi', u'totalcites', u'citeshist',
                                 u'gscholar'],
                   omit_fields=[]):
    s = ""
    for bib in refs:
        s2 = toBiBTeX(bib, valid_fields, omit_fields)
        s = s + s2 + "\n"
    return s
def toBiBTeX(ref,
             valid_fields=[u'title', u'editor', u'author', u'journal',
                           u'booktitle', u'publisher', u'school', u'issn',
                           u'volume', u'number', u'pages', u'year', u'doi',
                           u'pdf'],
             omit_fields=[]):
    omit = [each.lower() for each in omit_fields]
    fields = [each.lower() for each in valid_fields]
    # ttable = [('&', r'\&'),
    #           ('~', r'\~'),
    #           ('—', r'---'),
    #           ('–', r'--'),
    #           ]
    bib_key = ref[u'ID']
    ref_type = ref[u'ENTRYTYPE'].replace(u'Reference', u'')
    bibtex = u'\n@' + ref_type + u'{' + bib_key + u',\n'
    ref_keys = ref.keys()
    for k in fields:
        if k in ref_keys and k not in omit:
            # bibtexparser may store a field as a one-element list
            if type(ref[k]) == list:
                nv = ref[k][0]
            else:
                nv = ref[k]
            # for a,b in ttable:
            #     nv = nv.replace(a,b)
            bibtex = bibtex + u' %s = {%s},\n' % (k, nv)
    # remove the trailing comma and close the entry
    bibtex = bibtex[0:-2] + u"}\n"
    return bibtex
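
## A quick, offline illustration of toBiBTeX on a hand-made entry (the
## key and field values are made up):
def _demoToBiBTeX():
    entry = {u'ID': u'Foo01', u'ENTRYTYPE': u'article',
             u'title': u'A Title', u'year': u'2001'}
    # toBiBTeX(entry) returns:
    #
    # @article{Foo01,
    #  title = {A Title},
    #  year = {2001}}
    return toBiBTeX(entry)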
if __name__ == "__main__":
    if len(sys.argv) != 3:
        print '\n typical usage:'
        print ' "%s" input output\n' % sys.argv[0]
        print ' input  - input bibtex file'
        print ' output - output bibtex file'
        sys.exit()

    fname = sys.argv[1]
    fnout = sys.argv[2]

    # bibfile = codecs.open(fname, encoding='latin_1').read()
    # bibs = parseBibtex(bibfile)
    bibs = parseBibtex(fname)
    nbibs = grabAllCitations(bibs)
    so = generateBiBTeX(nbibs, [u'title', u'editor', u'author', u'journal',
                                u'booktitle', u'publisher', u'school', u'issn',
                                u'volume', u'number', u'pages', u'year',
                                u'doi', u'totalcites', u'citeshist',
                                u'gscholar', u'pdf', u'abstract'],
                        [])
    # entries may contain non-ASCII characters, so write the output as UTF-8
    with codecs.open(fnout, 'w', encoding='utf-8') as fo:
        fo.write(so)
    # print generateBiBTeX(bibs).encode('latin_1')