coding utf-8 import struct datetime decimal itertools sys def dbfreade

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
#coding=utf-8
import struct, datetime, decimal, itertools,sys
def dbfreader(f):
"""Returns an iterator over records in a Xbase DBF file.
The first row returned contains the field names.
The second row contains field specs: (type, size, decimal places).
Subsequent rows contain the data records.
If a record is marked as deleted, it is skipped.
File should be opened for binary reads.
"""
# See DBF format spec at:
# http://www.pgts.com.au/download/public/xbase.htm#DBF_STRUCT
numrec, lenheader = struct.unpack('<xxxxLH22x', f.read(32))
numfields = (lenheader - 33) // 32
fields = []
for fieldno in xrange(numfields):
name, typ, size, deci = struct.unpack('<11sc4xBB14x', f.read(32))
name = name.replace('\0', '') # eliminate NULs from string
fields.append((name, typ, size, deci))
yield [field[0] for field in fields]
yield [tuple(field[1:]) for field in fields]
terminator = f.read(1)
assert terminator == '\r'
fields.insert(0, ('DeletionFlag', 'C', 1, 0))
fmt = ''.join(['%ds' % fieldinfo[2] for fieldinfo in fields])
fmtsiz = struct.calcsize(fmt)
for i in xrange(numrec):
record = struct.unpack(fmt, f.read(fmtsiz))
if record[0] != ' ':
continue # deleted record
result = []
for (name, typ, size, deci), value in itertools.izip(fields, record):
if name == 'DeletionFlag':
continue
if typ == "N":
value = value.replace('\0', '').lstrip()
if value == '':
value = 0
elif deci:
value = decimal.Decimal(value)
else:
value = int(value)
elif typ == 'D':
y, m, d = int(value[:4]), int(value[4:6]), int(value[6:8])
value = datetime.date(y, m, d)
elif typ == 'L':
value = (value in 'YyTt' and 'T') or (value in 'NnFf' and 'F') or '?'
result.append(value)
yield result
def dbfwriter(f, fieldnames, fieldspecs, records):
""" Return a string suitable for writing directly to a binary dbf file.
File f should be open for writing in a binary mode.
Fieldnames should be no longer than ten characters and not include \x00.
Fieldspecs are in the form (type, size, deci) where
type is one of:
C for ascii character data
M for ascii character memo data (real memo fields not supported)
D for datetime objects
N for ints or decimal objects
L for logical values 'T', 'F', or '?'
size is the field width
deci is the number of decimal places in the provided decimal object
Records can be an iterable over the records (sequences of field values).
"""
# header info
ver = 3
now = datetime.datetime.now()
yr, mon, day = now.year-1900, now.month, now.day
numrec = len(records)
numfields = len(fieldspecs)
lenheader = numfields * 32 + 33
lenrecord = sum(field[1] for field in fieldspecs) + 1
hdr = struct.pack('<BBBBLHH20x', ver, yr, mon, day, numrec, lenheader, lenrecord)
f.write(hdr)
# field specs
for name, (typ, size, deci) in itertools.izip(fieldnames, fieldspecs):
name = name.ljust(11, '\x00')
fld = struct.pack('<11sc4xBB14x', name, typ, size, deci)
f.write(fld)
# terminator
f.write('\r')
# records
for record in records:
f.write(' ') # deletion flag
for (typ, size, deci), value in itertools.izip(fieldspecs, record):
if typ == "N":
value = str(value).rjust(size, ' ')
elif typ == 'D':
value = value.strftime('%Y%m%d')
elif typ == 'L':
value = str(value)[0].upper()
else:
value = str(value)[:size].ljust(size, ' ')
assert len(value) == size
f.write(value)
# End of file
f.write('\x1A')
# -------------------------------------------------------
# Example calls
if __name__ == '__main__':
import sys, csv
from cStringIO import StringIO
from operator import itemgetter
from md5 import md5
# Read a database
filename = '/home/dmitri/PIndx10.DBF'
f = open(filename, 'rb')
db = list(dbfreader(f))
f.close()
out = open('/home/dmitri/out.sql', 'w')
area_exists = []
region_exists = []
city_exists = []
i = 0
for record in db[2:]:
i +=1
area = record[4].decode('cp866').strip().encode('utf-8')
region = record[6].decode('cp866').strip().encode('utf-8')
city = record[7].decode('cp866').strip().encode('utf-8')
area_md5 = md5(area).hexdigest()
if area_md5 not in area_exists and area:
area_exists.append(area_md5)
s = "INSERT INTO `tatsunew`.`general_city` VALUES ('', 'РОССИЯ', '%s', '', '', '3');\n" % area
out.write(s)
region_md5 = md5(area + " " + region).hexdigest()
if region_md5 not in region_exists and region and area:
region_exists.append(region_md5)
s = "INSERT INTO `tatsunew`.`general_city` VALUES ('', 'РОССИЯ', '%s', '%s', '', '2');\n" % (area, region)
out.write(s)
city_md5 = md5(area + " " + region + " " + city).hexdigest()
if city_md5 not in city_exists and city and area and region:
city_exists.append(city_md5)
s = "INSERT INTO `tatsunew`.`general_city` VALUES ('', 'РОССИЯ', '%s', '%s', '%s', '1');\n" % (area, region, city)
out.write(s)
if i % 500 == 0:
print i
out.write(s)
out.close()