usr bin env python import itertools try import cElementTree as ET exce

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
#!/usr/bin/env python
import itertools
try:
import cElementTree as ET
except ImportError:
from elementtree import ElementTree as ET
import db
# XML namespace of the geonames ontology; process_record() uses it to
# locate the wikipediaArticle elements of a record.
GEO_NS = 'http://www.geonames.org/ontology#'
# RDF syntax namespace; process_record() uses it to read the `resource`
# attribute carried by each wikipediaArticle element.
RDF_NS = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'
# Path of the dump file that update() feeds to parse_file(). The
# commented-out assignment below is an alternative input path.
#FNAME = '.geodata/test.txt'
FNAME = '.geodata/all-geonames-rdf.txt'
def process_record(uri, xml):
    """
    Parse one record from the geonames dump.

    uri -- URI of the record; its last path segment must be the numeric
           geoname id (a trailing slash is tolerated).
    xml -- XML text describing the record.

    Returns a dict {'id': <int>, 'wikipedia': {<lang>: <link>}} where
    <lang> is the first dot-separated label of the link's host part
    (e.g. 'en' for 'http://en.wikipedia.org/wiki/Foo').

    Raises ValueError if the last URI segment is not an integer and
    IndexError if the URI contains no '/'. Errors raised by
    ET.fromstring on malformed XML propagate unchanged.
    """
    elem = ET.fromstring(xml)
    pages = {}
    for art in elem.findall('*/{%s}wikipediaArticle' % GEO_NS):
        link = art.get('{%s}resource' % RDF_NS)
        if not link:
            # Element without an rdf:resource attribute: nothing to record.
            continue
        # Drop the scheme by splitting on '://' instead of slicing a fixed
        # seven characters, so 'https://' and scheme-less links also yield
        # the right language label.
        host_and_path = link.split('://', 1)[-1]
        lang = host_and_path.split('.', 1)[0]
        pages[lang] = link
    # Named geoname_id rather than `id` to avoid shadowing the builtin.
    geoname_id = int(uri.rstrip('/').rsplit('/', 1)[1])
    return {'id': geoname_id, 'wikipedia': pages}
def parse_file(fname):
    """
    Parse geonames RDF dump.

    The file is consumed two lines at a time: every pair of consecutive
    lines is stripped of surrounding whitespace and handed to
    process_record() as (uri, xml). This is a generator; it yields the
    value returned by process_record() for each pair. A trailing line
    without a partner is ignored.

    fname -- path of the dump file to read.
    """
    # The `with` block guarantees the file is closed once the generator is
    # exhausted, closed early, or garbage-collected.
    with open(fname) as dump:
        pair = []
        for line in dump:
            pair.append(line.strip())
            if len(pair) == 2:
                yield process_record(*pair)
                pair = []
def update():
    """
    Report wikipedia links for cities present in the database.

    Reads every geoname_id from the `cities` table, then walks the dump
    named by FNAME. For each record whose id is in that table and which
    has a Russian or English wikipedia link, prints the id and the English
    link (None when only the Russian one exists). A progress line is
    printed every 10000 records.

    The statements that would write the links back to the database are
    disabled (commented out), so this function currently only prints.
    """
    cursor = db.cursor()
    cursor.execute("""
SELECT geoname_id
FROM cities""")
    # A set makes the per-record membership test below O(1); with a list
    # every dump record would trigger a linear scan of all city ids.
    geoname_ids = set(row[0] for row in cursor.fetchall())
    # Only consumed by the disabled write path further down.
    insert_count = itertools.count(1)
    for count, item in enumerate(parse_file(FNAME)):
        if count and not count % 10000:
            # Single pre-formatted argument: prints the same text under the
            # Python 2 print statement and the Python 3 print function.
            print('step %d' % count)
        #print item['id'], item['wikipedia']
        pages = item['wikipedia']
        if not pages or item['id'] not in geoname_ids:
            continue
        ru_link = pages.get('ru')
        en_link = pages.get('en')
        if ru_link or en_link:
            # Disabled write path, kept for reference.
            #cursor.execute("""
            #UPDATE cities
            #SET wikipedia_ru = %s,
            #wikipedia_en = %s
            #WHERE geoname_id = %s""",
            #(ru_link, en_link, item['id']))
            print('%s %s' % (item['id'], en_link))
            # NOTE(review): as written, the condition below is true for every
            # count that is NOT a multiple of 100 -- confirm the intended
            # commit interval before re-enabling.
            #if insert_count.next() % 100:
            #cursor.execute("COMMIT")
# Script entry point: run the update only when this file is executed
# directly, not when it is imported as a module.
if __name__ == '__main__':
    update()