# Parse the RDF dump 'content.rdf.u8' with a SAX handler and write one CSV row
# (topic, url, title, description) per ExternalPage record to 'dmoz.csv'.
import sys
from xml.sax import make_parser, handler
import csv
# Module-level CSV writer shared by Handler.save_page(); it creates/truncates
# 'dmoz.csv' in the current working directory as soon as the module is loaded.
# The csv module needs the file opened without newline translation, which is
# spelled differently on the two major Python versions:
#   * Python 2: binary mode, rows are written as UTF-8 encoded byte strings.
#   * Python 3: text mode with newline='' and an explicit UTF-8 encoding.
if sys.version_info[0] < 3:
    out = csv.writer(open('dmoz.csv', 'wb'))
else:
    out = csv.writer(open('dmoz.csv', 'w', newline='', encoding='utf-8'))
class Handler(handler.ContentHandler):
    """SAX content handler that turns ``ExternalPage`` elements into CSV rows.

    For every ``ExternalPage`` element the handler builds a dict holding the
    element's ``about`` attribute (stored as ``'url'``) and the text of its
    ``d:Title``, ``d:Description`` and ``topic`` children. When the
    ``ExternalPage`` element closes, the record is counted and passed to
    :meth:`save_page`. The total is printed at the end of the document.
    """

    # Column order of every CSV row written by save_page().
    FIELDS = ('topic', 'url', 'title', 'description')

    # Elements whose character data is captured, mapped to the key the text
    # is stored under in the page dict.
    TEXT_ELEMENTS = {
        'd:Title': 'title',
        'd:Description': 'description',
        'topic': 'topic',
    }

    # List of text chunks while inside a captured element, otherwise None.
    content = None
    # Number of ExternalPage records completed so far.
    count = 0
    # Dict for the ExternalPage currently being read, otherwise None.
    page = None

    def startElement(self, name, attrs):
        """Open a new page record and/or start capturing text for *name*.

        Raises KeyError if an ``ExternalPage`` element has no ``about``
        attribute.
        """
        if name == 'ExternalPage':
            self.page = {'url': attrs['about']}
        # Only the elements listed in TEXT_ELEMENTS capture text; starting any
        # other element switches capturing off.
        self.content = [] if name in self.TEXT_ELEMENTS else None

    def endElement(self, name):
        """Store captured text on the page, or finish the page record."""
        if name == 'ExternalPage':
            self.count += 1
            self.save_page(self.page)
            # Forget the finished record so stray text elements that appear
            # outside an ExternalPage cannot touch it.
            self.page = None
            return
        key = self.TEXT_ELEMENTS.get(name)
        # A captured element outside any ExternalPage is ignored rather than
        # failing on a missing page dict.
        if key is not None and self.page is not None:
            self.page[key] = ''.join(self.content)

    def characters(self, content):
        """Accumulate a chunk of text while a captured element is open."""
        # The parser may deliver one text node in several calls, so chunks
        # are collected in a list and joined in endElement().
        if self.content is not None:
            self.content.append(content)

    def endDocument(self):
        """Print the number of ExternalPage records that were processed."""
        print('Total records: %d' % self.count)

    def save_page(self, page):
        """Print the page's topic and URL and append it to the CSV output.

        *page* is a dict that may contain the keys listed in ``FIELDS``; a
        missing key is written as an empty string instead of raising
        KeyError. The row is written through the module-level ``out`` writer.
        """
        row = [page.get(key, u'') for key in self.FIELDS]
        # Single-argument print() behaves the same on Python 2 and Python 3.
        print('%s %s' % (row[0], row[1]))
        if sys.version_info[0] < 3:
            # The Python 2 csv module only handles byte strings.
            row = [value.encode('utf-8') for value in row]
        out.writerow(row)
if __name__ == '__main__':
    # The dump to read may be given as the first command-line argument; with
    # no argument the original hard-coded location is used.
    default_source = '/var/lib/mysql/content.rdf.u8'
    source = sys.argv[1] if len(sys.argv) > 1 else default_source

    parser = make_parser()
    parser.setContentHandler(Handler())
    # Known failure recorded by the original author for the full dump:
    # xml.sax._exceptions.SAXParseException:
    #   /var/lib/mysql/content.rdf.u8:27899230:155: reference to invalid character number
    # Alternative input kept from the original: parser.parse('test.rdf')
    parser.parse(source)

    # lxml-based experiment kept from the original (disabled):
    # from lxml.etree import iterparse
    # for event, elem in iterparse(open('test.rdf')):
    #     print event, elem.tag