Copyright 2007 Thomas Lotze See also LICENSE txt Requesting URL page r

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
# Copyright (c) 2007 Thomas Lotze
# See also LICENSE.txt
"""Requesting a URL's page rank from Google and parsing the response.

The hash algorithm and query string assembly are modelled on the
WWW::Google::PageRank Perl module by Yuri Karaban, who says he in turn took
the knowledge from the pagerankstatus Mozilla extension.
"""
import urllib
import re
import time
# Host name that query_url() puts into the "http://<HOST>/search?..." URL.
HOST = "toolbarqueries.google.com"
def _cutoff32(value):
    """Map the value to an unsigned 32-bit integer.

    value: int of any size or sign
    returns int in the range 0..0xffffffff, i.e. value modulo 2**32
    """
    # Python's % always yields a non-negative result for a positive modulus,
    # so negative inputs wrap around the way unsigned 32-bit arithmetic does.
    return value % 0x100000000
def _le_encode(value):
    """Encode an integer into 4 bytes in little-endian order.

    value: int of any size or sign; only its low 32 bits are encoded, so
        negative and oversized values wrap modulo 2**32
    returns list of four ints in the range 0..255, least significant first
    """
    # Masking each shifted value with 0xff already discards everything above
    # bit 31 (and handles negative numbers, since >> is an arithmetic shift),
    # so no separate reduction to 32 bits is needed.
    return [(value >> shift) & 0xff for shift in (0, 8, 16, 24)]
def _le_decode(value):
    """Decode up to 4 bytes in little-endian order into an integer.

    value: sequence of ints (byte values); only the first four items are
        used, and a shorter sequence decodes as if padded with zero bytes
    returns int (0 for an empty sequence)
    """
    return sum(
        byte << (8 * position)
        for position, byte in enumerate(value[:4]))
def _mix(a, b, c):
    """Transform an integer triple in an irreversible (?) way.

    Runs three rounds of subtract / shift / xor. Every subtraction and left
    shift is reduced to 32 bits, so the results stay unsigned 32-bit values.

    a, b, c: int
    returns tuple of three ints, each in the range 0..0xffffffff
    """
    # Shift amounts per round: (right shift of c, left shift of a,
    # right shift of b).
    rounds = ((13, 8, 13), (12, 16, 5), (3, 10, 15))
    c = _cutoff32(c)
    for c_shift, a_shift, b_shift in rounds:
        # Each step uses the values updated by the previous step, so the
        # three assignments must stay in this order.
        a = _cutoff32(a - b - c) ^ (c >> c_shift)
        b = _cutoff32(b - c - a) ^ _cutoff32(a << a_shift)
        c = _cutoff32(c - a - b) ^ (b >> b_shift)
    return a, b, c
def _checksum(value):
    """Reduce a sequence of integers to a hash value.

    The input is consumed in blocks of 12 items (three little-endian 4-byte
    words each); the remaining 0..11 items are mixed in last, together with
    the total length.

    value: sequence of ints (byte values) supporting len() and slicing
    returns int, the third component of the final mixed triple
    """
    length = len(value)
    a, b, c = 0x9e3779b9, 0x9e3779b9, 0xe6359a60

    # Offset of the first item that is not part of a complete 12-item block.
    tail_start = length - length % 12
    for start in range(0, tail_start, 12):
        a, b, c = _mix(
            a + _le_decode(value[start:start + 4]),
            b + _le_decode(value[start + 4:start + 8]),
            c + _le_decode(value[start + 8:start + 12]))

    # The tail holds at most 11 items, so its last word has at most 3 bytes.
    # It is shifted up by one byte and the input length is added to c.
    tail = value[tail_start:]
    a, b, c = _mix(
        a + _le_decode(tail[0:4]),
        b + _le_decode(tail[4:8]),
        c + (_le_decode(tail[8:]) << 8) + length)
    return c
def checksum(value):
    """Double-fold a string into a hash value.

    The string is hashed once; the result is scrambled, expanded into a
    series of 20 little-endian 4-byte words and hashed a second time.

    value: byte string, or text which is hashed as its UTF-8 encoding
    returns int
    """
    # Hash bytes rather than code points: text is encoded first so that
    # non-ASCII characters contribute proper byte values (0..255).
    if not isinstance(value, bytes):
        value = value.encode("utf-8")
    ch = _checksum(list(bytearray(value)))

    # Floor division keeps ch an integer regardless of whether true division
    # is in effect; a float here would break the shift.
    ch = ((ch % 0x0d) & 7) | ((ch // 7) << 2)

    encoded = []
    for i in range(20):
        encoded.extend(_le_encode(ch - 9 * i))
    return _checksum(encoded)
def query_url(target):
    """Compose a query URL for target containing a computed checksum.

    The query sent is "info:" followed by the target; its checksum is
    prefixed with "6" and passed as the "ch" parameter.

    target: str, a URL
    returns str
    """
    # urlencode lives in different places depending on the Python version.
    try:
        from urllib import urlencode
    except ImportError:
        from urllib.parse import urlencode

    query = "info:" + target
    params = urlencode({
        "client": "navclient-auto",
        "ch": "6%s" % checksum(query),
        "ie": "UTF-8",
        "oe": "UTF-8",
        "features": "Rank",
        "q": query,
    })
    return "http://%s/search?%s" % (HOST, params)
def read_rank(response):
    """Read the pagerank from Google's response.

    After stripping surrounding whitespace, the whole response must have
    the form "Rank_<digits>:<digits>:<rank>".

    response: str, HTTP response body
    returns str, the digits of the rank
    raises ValueError if a pagerank wasn't found
    """
    # The pattern is anchored at both ends and not multi-line, so a single
    # match attempt at the start of the text is all that is needed.
    match = re.match(r"^Rank_\d+:\d+:(\d+)$", response.strip())
    if match is None:
        raise ValueError("no pagerank found in response")
    return match.group(1)
# And now begins my code :-)
from libpy.parser.parser2 import Parser2
class GooglePR(Parser2):
    """Parser that looks up the Google pagerank of a URL."""

    def _get(self, url, sleep=(2, 4)):
        """Fetch the pagerank for url and return it as an integer.

        url: str, the page to look up; "http://" is prepended when the
            value does not already start with an http or https scheme
        sleep: passed unchanged to self.sleep() after the request
            (presumably a delay range in seconds -- confirm against Parser2)
        returns int, the pagerank, or 0 if the response held no rank
        """
        # Accept https URLs as given; prefixing them with "http://" would
        # produce a malformed "http://https://..." target.
        if not url.lower().startswith(('http://', 'https://')):
            url = 'http://%s' % url
        data = self.fetch(query_url(url))
        try:
            pr = int(read_rank(data))
        except ValueError:
            # Best effort: an unparsable response counts as rank 0.
            pr = 0
        self.sleep(sleep)
        return pr