coding utf-8 from datetime import date import re from BeautifulSoup im

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
# -*- coding: utf-8 -*-
from datetime import date
import re
from BeautifulSoup import BeautifulSoup
# Captions of the profile rows as they appear on the page (the literals below
# look like Ukrainian and Russian variants), grouped under the canonical field
# name each caption stands for.
LABELS_BASE = dict(
    sex=[u'Стать:', u'Пол:'],
    status=[u'Сiмейний стан:', u'Семейное положение:'],
    birth_date=[u'День народження:', u'День рождения:'],
    birth_city=[u'Рiдне мiсто:', u'Родной город:'],
    politic=[u'Полiт. погляди:', u'Полит. взгляды:'],
    religion=[u'Релiг. погляди:', u'Религ. взгляды:'],
    city=[u'Мiсто:', u'Город:'],
    phone_mobile=[u'Моб. телефон:', u'Моб. телефон:'],
    phone_home=[u'Дом. телефон:', u'Дом. телефон:'],
    icq=[u'ICQ:'],
)

# Reverse index: caption text -> canonical field name.
LABELS_REVERSED = {
    caption: field
    for field, captions in LABELS_BASE.items()
    for caption in captions
}
def parse_info_tag(name):
    """Build a parser for a profile row whose value is a people-search link.

    ``name`` is the search parameter name (for example ``'sex'``). It is
    regex-escaped and the href pattern is compiled once here, not on every
    call of the returned callback.

    The returned callable takes the row's data element (a BeautifulSoup
    node), picks the first ``<a>`` whose ``href`` starts with
    ``/gsearch.php?from=people&c[<name>]=`` and returns a dict with:

    * ``'id'`` -- the integer that ends the link's ``href`` (after ``=``);
    * ``'value'`` -- the link's ``.string``.

    The callback raises ``IndexError`` when no such link is present and
    ``AttributeError`` when the ``href`` does not end in ``=<digits>``.
    """
    # The dot is escaped so that only a literal "gsearch.php" matches.
    pattern = re.compile(
        r'^/gsearch\.php\?from=people&c\[%s\]=' % re.escape(name))

    def parse_callback(soup):
        tag = soup.findAll('a', href=pattern)[0]
        # Named tag_id rather than id to avoid shadowing the builtin.
        tag_id = int(re.search(r'=(\d+)$', tag['href']).group(1))
        return {'id': tag_id, 'value': tag.string}

    return parse_callback
def parse_birth_date(soup):
    """Extract the birth date from the people-search links of a profile row.

    ``soup`` is the row's data element (a BeautifulSoup node). Day and month
    are read from the first link whose ``href`` carries ``c[bday]=`` (and
    ``c[bmonth]=`` right after it); the year is read from the first link
    whose ``href`` carries ``c[byear]=``.

    Returns a ``datetime.date``.

    Raises ``IndexError`` when either link is missing, ``AttributeError``
    when a link's ``href`` lacks the expected numeric parameters, and
    ``ValueError`` when the numbers do not form a valid date.
    """
    # The day/month pattern used to be anchored at "gsearch.php" without the
    # leading slash that every other pattern in this module requires. The
    # slash is optional in both patterns now, so either href form is found.
    bday_link = re.compile(r'^/?gsearch\.php\?from=people&c\[bday\]=')
    byear_link = re.compile(r'^/?gsearch\.php\?from=people&c\[byear\]=')

    bdate_href = soup.findAll('a', href=bday_link)[0]['href']
    bday, bmonth = map(int, re.search(
        r'\[bday\]=(\d+)&c\[bmonth\]=(\d+)', bdate_href).groups())

    byear_href = soup.findAll('a', href=byear_link)[0]['href']
    byear = int(re.search(r'\[byear\]=(\d+)', byear_href).group(1))

    return date(byear, bmonth, bday)
def parse_raw_data(soup):
    """Return ``soup.string`` with leading and trailing whitespace removed."""
    text = soup.string
    return text.strip()
def parse_city(soup):
    """Parse the city row: the first ``<a>`` gives both the id and the name.

    Returns ``{'id': <int taken from c[city]= in the href>,
    'name': <the link's .string>}``.
    """
    link = soup.a
    found = re.search(r'c\[city\]=(\d+)', link['href'])
    return {'id': int(found.group(1)), 'name': link.string}
def _first_link_text(href_regex):
    """Return a parser giving the ``.string`` of the first ``<a>`` whose
    ``href`` matches ``href_regex``."""
    def callback(soup):
        return soup.findAll('a', href=re.compile(href_regex))[0].string
    return callback


# Canonical field name -> callable that parses that row's data element.
CALLBACKS = {
    'sex': parse_info_tag('sex'),
    'status': parse_info_tag('status'),
    'birth_date': parse_birth_date,
    'birth_city': _first_link_text(r'f23='),
    'politic': parse_info_tag('politic'),
    'religion': _first_link_text(r'c\[religion\]='),
    'city': parse_city,
    # Plain-text rows: the stripped text of the element is the value.
    'phone_mobile': parse_raw_data,
    'phone_home': parse_raw_data,
    'icq': parse_raw_data,
}
def parse_profile_table(soup, labels=LABELS_REVERSED, callbacks=CALLBACKS):
    """Parse every ``<tr>`` found under ``soup`` into a dict.

    For each row the caption is the ``.string`` of its first ``<td>`` and the
    value element is the row's first ``<div class="dataWrap">``.

    ``labels`` maps a caption to a canonical field name and ``callbacks`` maps
    a field name to the callable that parses the value element. Neither
    mapping is modified.

    Returns a dict keyed by the caption text as found on the page.
    NOTE(review): keys are the localized captions, not the canonical field
    names -- confirm this is what consumers expect.

    Rows whose caption has no registered parser are reported with ``print``
    and skipped. Such rows no longer need a ``dataWrap`` div. An exception
    raised by a parser itself (including ``KeyError``) now propagates instead
    of being reported as an unknown label.
    """
    res = {}
    for tr in soup.findAll('tr'):
        label = tr.td.string
        # Only the two lookups are guarded, so a KeyError coming from inside
        # a parser cannot be mistaken for an unsupported caption.
        try:
            callback = callbacks[labels[label]]
        except KeyError:
            print("Unknown label: %s" % label)
            continue
        data = tr.findAll('div', attrs={'class': 'dataWrap'})[0]
        res[label] = callback(data)
    return res
def parse_account_info(soup):
    """Return ``{'profile_name': ...}`` taken from the ``<h2>`` inside the
    first ``<div class="profileName">`` under ``soup``."""
    name_div = soup.findAll('div', attrs={'class': 'profileName'})[0]
    heading = name_div.h2
    return dict(profile_name=heading.string)
def parse_personal_info(soup):
    """Parse the contact and private tables found under ``soup``.

    ``soup`` must contain exactly two elements with class ``profileTable``,
    the contact one first and the private one second; any other count makes
    the unpacking raise ``ValueError``.

    Returns one dict holding the entries of both tables. On a duplicate key
    the private table's entry wins.
    """
    contacts_div, private_div = soup.findAll(attrs={'class': 'profileTable'})
    # parse_contact_info / parse_private_info, which were called here before,
    # are not defined in the visible part of this module. PersonInfoParser
    # parses the same two tables with parse_profile_table, so that is used.
    res = dict(parse_profile_table(contacts_div))
    res.update(parse_profile_table(private_div))
    # The merged dict used to be dropped by a bare ``return``.
    return res
class PersonInfoParser(object):
    """Callable that parses the HTML of a profile page into its sections."""

    def __call__(self, page_contents):
        """Parse ``page_contents`` (the page's HTML) and return its sections.

        Returns a dict with four entries:

        * ``'account_info'`` -- result of ``parse_account_info`` on the first
          ``<div class="accountInfo clearFix">``;
        * ``'basic_info'`` -- result of ``parse_profile_table`` on the table
          of the first ``<div class="basicInfo">``;
        * ``'contact_info'`` and ``'private_info'`` -- results of
          ``parse_profile_table`` on the two ``<table class="profileTable">``
          elements inside ``<div id="personal">``, in document order.

        Raises ``IndexError`` when one of the container divs is missing and
        ``ValueError`` when the personal section does not hold exactly two
        profile tables.

        Earlier versions stopped in an interactive debugger breakpoint and
        returned nothing. Returning the parsed sections is backward
        compatible with callers that ignored the ``None`` result.
        """
        soup = BeautifulSoup(page_contents)

        account_div = soup.findAll(
            'div', attrs={'class': 'accountInfo clearFix'})[0]
        account_info = parse_account_info(account_div)

        basic_div = soup.findAll('div', attrs={'class': 'basicInfo'})[0]
        basic_info = parse_profile_table(basic_div.table)

        personal_div = soup.findAll('div', id='personal')[0]
        contact_info_table, private_info_table = personal_div.findAll(
            'table', attrs={'class': 'profileTable'})
        contact_info = parse_profile_table(contact_info_table)
        private_info = parse_profile_table(private_info_table)

        return {
            'account_info': account_info,
            'basic_info': basic_info,
            'contact_info': contact_info,
            'private_info': private_info,
        }