Seo.py
# -*- coding: utf-8 -*-

import re
import urllib
import urllib2

# check whether the page at the given URL contains an input field (a textarea)
def checkinp(url):
    if url[:7] != 'http://':
        url = 'http://' + url
    try:
        page = urllib.urlopen(url).read()
    except Exception:
        return False
    return '<textarea' in page

# main domain of the page
def host(url):
    if url[:7] == 'http://':
        url = url[7:]
    url = url.split('/')[0]
    return 'http://' + url

# search in Yandex
# page   - number of the search results page
# numdoc - number of links per page (20, 50)
def ypages(text, page=0, numdoc=10, results=None):
    if results is None:          # avoid a shared mutable default argument
        results = []
    host = 'http://yandex.ru/yandsearch'
    headers = {
        'User-Agent': 'Opera/9.80 (Windows NT 5.1; U; ru) Presto/2.2.15 Version/10.00',
        'Host': 'ya.ru',
        'Accept': 'text/html, application/xml;q=0.9, application/xhtml+xml, image/png, image/jpeg, image/gif, image/x-xbitmap, */*;q=0.1',
        'Accept-Language': 'ru-RU,ru;q=0.9,en;q=0.8',
        'Accept-Charset': 'iso-8859-1, utf-8, utf-16, *;q=0.1',
        'Referer': 'http://www2.amit.ru/forum/index.php',
        'Connection': 'Keep-Alive, TE',
        'TE': 'deflate, gzip, chunked, identity, trailers'
    }

    if page != 0:
        url = urllib.urlencode({"p": str(page), "text": text, "lr": "77", "numdoc": str(numdoc)})
    else:
        url = urllib.urlencode({"text": text, "lr": "77", "numdoc": str(numdoc)})
    request = urllib2.Request(host + "?" + url, headers=headers)
    data = urllib2.urlopen(request)

    txt = data.read()
    links = re.findall(r'href="http://([^"]+)', txt)
    for link in links:
        if 'yandex' not in link and link not in results:
            results.append(link)
    return results

# remove frequently repeated expressions (in titles)
def del_strs(title):
    not_words = ['-recipes.', 'The quotes module.', 'The Citadel of Evil', '.']
    for word in not_words:
        if word in title:
            title = title.replace(word, '')
    return title

# get the title of the page
def get_title(url):
    page = urllib.urlopen(url).read()
    titles = re.findall(r'<h2 class="title">([^<]+)', page)
    return del_strs(titles[0])

# links from the sitemap
def sitemap_links(url):
    sitemap = urllib.urlopen(url).read()
    # pull the <loc> entries out of the sitemap XML
    urls = re.findall(r'<loc>([^<]+)</loc>', sitemap)
    return urls

# this and the following functions calculate PageRank
def get_pagerank(url):
    hsh = check_hash(hash_url(url))
    gurl = 'http://www.google.com/search?client=navclient-auto&features=Rank:&q=info:%s&ch=%s' % (urllib.quote(url), hsh)
    try:
        f = urllib.urlopen(gurl)
        rank = f.read().strip()[9:]
    except Exception:
        rank = 'N/A'
    if rank == '':
        rank = '0'
    return rank

def int_str(string, integer, factor):
    for i in range(len(string)):
        integer *= factor
        integer &= 0xFFFFFFFF
        integer += ord(string[i])
    return integer

def hash_url(string):
    c1 = int_str(string, 0x1505, 0x21)
    c2 = int_str(string, 0, 0x1003F)
    c1 >>= 2
    c1 = ((c1 >> 4) & 0x3FFFFC0) | (c1 & 0x3F)
    c1 = ((c1 >> 4) & 0x3FFC00) | (c1 & 0x3FF)
    c1 = ((c1 >> 4) & 0x3C000) | (c1 & 0x3FFF)
    t1 = (c1 & 0x3C0) << 4
    t1 |= c1 & 0x3C
    t1 = (t1 << 2) | (c2 & 0xF0F)
    t2 = (c1 & 0xFFFFC000) << 4
    t2 |= c1 & 0x3C00
    t2 = (t2 << 0xA) | (c2 & 0xF0F0000)
    return (t1 | t2)

def check_hash(hash_int):
    hash_str = '%u' % (hash_int)
    flag = 0
    check_byte = 0
    i = len(hash_str) - 1
    while i >= 0:
        byte = int(hash_str[i])
        if 1 == (flag % 2):
            byte *= 2
            byte = byte / 10 + byte % 10
        check_byte += byte
        flag += 1
        i -= 1
    check_byte %= 10
    if 0 != check_byte:
        check_byte = 10 - check_byte
        if 1 == flag % 2:
            if 1 == check_byte % 2:
                check_byte += 9
            check_byte >>= 1
    return '7' + str(check_byte) + hash_str

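Before wiring it into the main script, the module can be smoke-tested from a small throwaway script. This is only a sketch and not part of the original paste: it assumes the code above is saved as seo.py next to it, and toly-blog.ru is simply borrowed from the main script below; any reachable URL works.

# -*- coding: utf-8 -*-
# Minimal smoke test for the seo module above (illustrative only, not part of the original).
import seo

url = 'http://toly-blog.ru/'
print seo.host(url + 'some/deep/page')   # -> http://toly-blog.ru
print seo.checkinp(url)                  # True if the page contains a <textarea>
print seo.get_pagerank(url)              # toolbar PageRank as a string, e.g. '0'
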
And here is the main script that does all the work:

# -*- coding: utf-8 -*-
import seo
import urllib

# report (log) file name
fname = 'result.csv'
# sitemap address
sitemap = 'http://toly-blog.ru/sitemap.xml'

# write a line to the log file
def log(st):
    global fname
    f = open(fname, 'a+')
    f.write(st + '\n')
    f.close()

# write the table header
log('pr_host;pr_page;url;link\n')

# read the site's links from the sitemap
urls = seo.sitemap_links(sitemap)
# for each page of the site
for url in urls:
    print url
    # determine the page title
    title = seo.get_title(url)
    if title == 'Categories':
        continue
    print title
    # search Yandex for the title
    links = seo.ypages(title)
    # for each page found
    for link in links:
        # if it has input fields
        if seo.checkinp(link):
            # determine its PageRank
            pr_page = str(seo.get_pagerank(link))
            # and the PageRank of its host
            pr_host = str(seo.get_pagerank(seo.host(link)))
            # assemble the row and write it to the log
            st = pr_host + ';' + pr_page + ';' + url + ';' + link
            log(st)
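
The report is plain ';'-separated text, so it is easy to post-process. As one possible follow-up (a sketch, not part of the original script), the finished result.csv can be read back and the found pages listed with the highest host PageRank first:

# -*- coding: utf-8 -*-
# Sketch: read result.csv back and list candidate pages by host PageRank.
# Assumes the pr_host;pr_page;url;link rows written by the script above;
# the header row and rows whose rank lookup failed ('N/A') are skipped.
rows = []
for line in open('result.csv'):
    parts = line.strip().split(';')
    if len(parts) < 4 or parts[0] in ('pr_host', 'N/A', ''):
        continue
    rows.append(parts)

# highest host PageRank first
rows.sort(key=lambda r: int(r[0]), reverse=True)
for pr_host, pr_page, url, link in rows:
    print pr_host, link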
