# -*- coding: utf-8 -*-
from xml.etree import ElementTree as ET
import re
import urllib
from datetime import datetime, timedelta
import time
import os
import fcntl
import cPickle as pickle
import gdbm
EXAMPLE_XML = """<?xml version="1.0" encoding="UTF-8"?><data><bot_ips><ip>77.91.226.131</ip><ip>77.91.226.130</ip><ip>86.111.22.107</ip><ip>88.208.19.9</ip><ip>77.91.226.135</ip><ip>77.91.226.133</ip><ip>77.91.226.151</ip><ip>206.161.193.2</ip><ip>207.226.178.210</ip><ip>77.91.226.180</ip><ip>77.91.228.36</ip><ip>77.91.226.136</ip></bot_ips><config><item name="after_text"><![CDATA[]]></item><item name="delimiter"><![CDATA[;]]></item><item name="before_text"><![CDATA[]]></item><item name="end"><![CDATA[<!--3a9f6b4e-->]]></item><item name="start"><![CDATA[<!--3a9f6b4e-->]]></item></config></data>"""
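# Illustrative sketch (not part of the original script): EXAMPLE_XML shows the
# feed layout that fetch_database() parses below; parsing it directly
# demonstrates the same ElementTree calls on known data. The helper name
# _parse_example is made up for this demo.
def _parse_example():
    tree = ET.fromstring(EXAMPLE_XML)
    bot_ips = [x.text for x in tree.findall('bot_ips/ip')]
    config = dict((elem.get('name'), elem.text or '')
                  for elem in tree.findall('config/item'))
    return bot_ips, config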
# user_id, host
URL = 'http://db.linkfeed.ru/%s/%s/UTF-8.xml'
def main():
    import om.t  # local timing helper, not a standard-library module
    host = 'site.ru'
    uri = '/'
    uid = '****************************************'
    db_file = '../linkfeed.gdbm'
    tm = om.t.T()
    for i in xrange(1):  # raise the count when timing repeated calls
        ret = get_links(uid, host, uri, db_file=db_file)
    tm.put()
    #~ print ret.encode('utf-8')
def fetch_remote_database(uid, host):
    url = URL % (uid, host)
    data = urllib.urlopen(url).read()
    return data
def fetch_database(uid, host, db_file=None):
    """
    Load cached links and refresh them from the linkfeed.ru service
    if they are too old.
    """
    if not os.path.exists(db_file):
        # create the cache file if it does not exist yet
        try:
            file(db_file, 'w').write('')
            os.chmod(db_file, 0666)
        except IOError:
            raise Exception('Could not create %s' % db_file)
    mtime = os.stat(db_file).st_mtime
    check_time = time.time() - 3600
    if mtime < check_time or not os.path.getsize(db_file):
        # the cache is more than an hour old (or empty): refresh the data
        data = fetch_remote_database(uid, host)
        # parse the XML
        tree = ET.fromstring(data)
        bot_ips = [x.text for x in tree.findall('bot_ips/ip')]
        config = {}
        for elem in tree.findall('config/item'):
            key = elem.get('name')
            config[key] = elem.text or ''
        pages = {}
        for page_node in tree.findall('pages/page'):
            page = pages.setdefault(page_node.get('url'), [])
            for link_node in page_node.findall('link'):
                page.append(link_node.text)
        # store everything in gdbm
        #~ os.unlink(db_file)  # better left out when there are few changes
        db = gdbm.open(db_file, 'cs')
        db['_bot_ips_'] = pickle.dumps(bot_ips)
        db['_config_'] = pickle.dumps(config)
        for k, v in pages.iteritems():
            db[k] = pickle.dumps(v)
        db.close()  # release the writer before reopening read-only
    # load the data
    db = gdbm.open(db_file)
    bot_ips = pickle.loads(db['_bot_ips_'])
    config = pickle.loads(db['_config_'])
    return {'bot_ips': bot_ips, 'config': config, 'pages': db}
def get_links(uid, host, uri, db_file=None):
    db = fetch_database(uid, host, db_file)
    try:
        # unpickle only when the page is present; unpickling the empty
        # fallback list would raise a TypeError
        links = pickle.loads(db['pages'][uri])
    except KeyError:
        links = []
    config = db['config']
    html = [config['start']]
    for i, link in enumerate(links):
        html.append(config['before_text'])
        html.append(link)
        html.append(config['after_text'])
        if i < len(links) - 1:
            # compare by position, not value: a duplicate link text must not
            # suppress the delimiter
            html.append(config['delimiter'])
    html.append(config['end'])
    return ''.join(html)
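# Illustrative helper (not part of the original script): the feed's bot_ips
# list identifies the service's crawler, but this script never consults it;
# a minimal check could look like this, where 'remote_ip' is an assumed
# parameter name (e.g. REMOTE_ADDR from the web server).
def is_bot(remote_ip, uid, host, db_file=None):
    db = fetch_database(uid, host, db_file)
    return remote_ip in db['bot_ips']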
if __name__ == "__main__":
main()