# Site-graph visualizer for the nashomsk.ru catalog (визуализатор графа сайтов каталога nashomsk.ru)

# Bootstrap Django so the ORM models below can be imported from a script.
from dja.setup_django import go
go(__file__, '/../..')
import yapgvb  # Graphviz bindings used to lay out and render the graph
from BeautifulSoup import BeautifulSoup  # BeautifulSoup 3 (Python 2 era)
from urlparse import urlsplit
import pickle
from catalog.models import Site
# Stage toggles: re-crawl the catalog vs. reuse the pickled dump, and
# whether to render the graph image at the end.
parse = True
draw = True
# domain -> {'links': [linked domains], 'node': graphviz node, 'remove': flag}
# NOTE(review): shadows the builtin `map`; kept as-is because the name is
# referenced throughout the script below.
map = {}
graph = yapgvb.Digraph('nashomsk.ru')
def get_links(domain, source):
    """Return the set of domains referenced by ``<a href>`` tags in *source*.

    ``domain`` is only printed as crawl progress; ``source`` is the raw HTML
    of the site's index page.  Anchors without an ``href`` contribute the
    empty string's netloc (``''``), which callers filter out by membership
    testing against the known-domain map.
    """
    print(domain)  # progress indicator; parenthesized form works on Py2 too
    try:
        soup = BeautifulSoup(source)
    except Exception:
        # BeautifulSoup 3 can raise on badly broken markup; fall back to an
        # empty document rather than aborting the whole crawl.
        # (Was a bare `except:`, which also swallowed KeyboardInterrupt.)
        soup = BeautifulSoup('')
    # urlsplit(...)[1] is the netloc (domain) part of each link target.
    links = set([urlsplit(a.get('href', ''))[1] for a in soup.findAll('a')])
    return links
def remove_trash(map):
    """Prune self-links and isolated sites from *map*, in place.

    *map* maps domain -> {'links': [domains it links to]}.  After the call:
    no entry lists itself in its own 'links', and any entry that has no
    outgoing links AND is linked to by nobody is deleted entirely.

    NOTE(review): the parameter shadows the builtin ``map``; the name is kept
    for consistency with the rest of the script.
    """
    backlinks = set()
    for domain, item in map.items():
        # A site linking to itself carries no information for the graph.
        if domain in item['links']:
            item['links'].remove(domain)
        backlinks.update(item['links'])
    # Mark nodes that neither link out nor are linked to: they would render
    # as disconnected dots and only clutter the drawing.
    for domain, item in map.items():
        if not item['links'] and domain not in backlinks:
            item['remove'] = True
    # Delete over a snapshot of the keys so mutating the dict is safe.
    for domain in list(map.keys()):
        if map[domain].get('remove'):
            del map[domain]
if parse:
    # Pre-register every catalog site so that only intra-catalog links are
    # recorded below (links to outside domains are simply not in `map`).
    for site in Site.objects.all():
        map[site.domain] = {'links': []}
    for site in Site.objects.all():
        for domain in get_links(site.domain, site.index_source):
            if domain in map:
                map[site.domain]['links'].append(domain)
    # Persist the crawl result.  Pickle is binary data: open with 'b' and
    # close the handle deterministically (the original left it to the GC).
    with open('pub/graph-dump', 'wb') as dump_file:
        pickle.dump(map, dump_file)
else:
    # Reuse the previous crawl.  NOTE(review): unpickling is only safe here
    # because the dump is produced by this same script, never by a user.
    with open('pub/graph-dump', 'rb') as dump_file:
        map = pickle.load(dump_file)
if draw:
    # Drop self-links and isolated sites before rendering.
    remove_trash(map)
    # First pass: create one Graphviz node per surviving domain.
    for domain, item in map.items():
        item['node'] = graph.add_node('%s' % str(domain),
                                      fontname='data/arial.ttf',
                                      fontsize=10, margin=0.1,
                                      shape='ellipse')
    # Second pass: a directed edge for every recorded intra-catalog link.
    # (yapgvb overloads `>>` to mean "add edge from left node to right node".)
    for domain, item in map.items():
        for link in item['links']:
            item['node'] >> map[link]['node']
    # 'circo' draws a circular layout; 'fdp'/'twopi'/'neato' are alternatives.
    layout = 'circo'
    graph.layout(getattr(yapgvb.engines, layout))
    graph.render('pub/nashomsk.ru-graph.png')