PYTHON   90

keyword scraper py

Guest on 21st August 2022 08:40:07 AM

  1. import re
  2. import os
  3.  
  4. def cmp_keywords(x,y):
  5.         '''
  6.         Sorts keywords by length, and then alphabetically
  7.         '''
  8.         if len(x) < len(y):
  9.                 return 1
  10.         elif len(x) == len(y):
  11.                 # Sort alphabetically
  12.                 if x == y:
  13.                         return 0
  14.                 elif x < y:
  15.                         return -1
  16.                 else:
  17.                         return 1
  18.         else:
  19.                 return -1
  20.  
  21. def keywords(infile, outdir):
  22.         '''
  23.         Scrapes comma separated keywords out of a file and sorts them in descending order of length.
  24.         It is assumed a keyword is surrounded in quotes ('' or ""), are grouped by commas and separated by line breaks.
  25.         The output is then printed and each group is written in text files in the given directory
  26.  
  27.         An example use case for this is scraping keywords out of GeSHi language files:
  28.  
  29.                 >>> keywords('geshi_lang_file.php', 'somedir')
  30.  
  31.         '''
  32.         if outdir and not os.path.exists(outdir):
  33.                 os.makedirs(outdir)
  34.  
  35.         f = open(infile, 'r')
  36.         fs = f.read()
  37.         fs = re.sub(r"(//.*?[\r\n])|(/\*.*?\*/)", '', fs)
  38.  
  39.         matches = re.findall(r"(?:(?:'[^']+'|\"[^\"]+\")(?:[ \t]*[\r\n]?[ \t]*,[ \t]*[\r\n]?[ \t]*)?(?!\s*=>)){2,}", fs, flags=re.I | re.M | re.S)
  40.         output = ''
  41.         group = 0
  42.         for i in matches:
  43.                 match = re.findall(r"'([^']+)'", i, flags=re.I | re.M | re.S)
  44.                 match.sort(cmp=cmp_keywords)
  45.                 suboutput = ''
  46.                 for m in match:
  47.                         m = m.strip()
  48.                         if len(m) > 0:
  49.                                 suboutput += m + '\n'
  50.                 suboutput += '\n'
  51.                 if outdir:
  52.                         w = open(outdir + '/' + str(group) + '.txt' , 'w')
  53.                         w.write(suboutput)
  54.                 output += suboutput
  55.                 group += 1;
  56.  
  57.         print output
  58.  
  59.         exit()
  60.         matches = re.findall(r"(['\"])(.*?)\1", fs, re.I | re.M | re.S)
  61.         output = ''
  62.         if len(matches):
  63.                 for m in matches:
  64.                         s = m[1].strip()
  65.                         if len(s) > 0:
  66.                                 output += s + '\n'
  67.         f.close()
  68.         print output
  69.         if w:
  70.                 w.write(output)
  71.                 w.close()

Raw Paste


Login or Register to edit or fork this paste. It's free.