# DBpedia crawler: dereferences resource URIs with rdflib, copies their
# triples into a SPARQL endpoint, and queues newly discovered URIs in the
# PostgreSQL table `uri`.
import rdflib
from rdflib import Graph, URIRef
from rdflib.namespace import RDF
from SPARQLWrapper import SPARQLWrapper, JSON
import psycopg2
import datetime
import threading
import time
# Caps how many worker threads may be inside parseURI at once; acquired and
# released in myThread.run.
threadLimiter = threading.BoundedSemaphore(50)
# Date written to the updated_date column of every URI row inserted by parseURI.
today_date = datetime.date.today()
# PostgreSQL server name appended to every psycopg2 connection string.
host = "localhost"
# NOTE(review): not referenced in the visible code; parseURI hard-codes a
# different SPARQL endpoint -- confirm which one is intended.
sparqlHost = "http://localhost:8890/sparql"
# Running total of newly queued URIs; incremented by parseURI.
counter = 0
# NOTE(review): not read anywhere in the visible code (parseURI's parameter of
# the same name shadows it) -- possibly a leftover seed URI.
uri = u'http://dbpedia.org/resource/Yugoslavia'
class myThread(threading.Thread):
    """Worker thread that runs parseURI on a single URI.

    The URI is carried in the thread's ``name`` attribute. Concurrency is
    bounded by the module-level ``threadLimiter`` semaphore, which is held
    for the whole duration of the parseURI call.
    """

    def __init__(self, follower_id):
        """Create the thread; *follower_id* is the URI to hand to parseURI."""
        super(myThread, self).__init__()
        self.name = follower_id

    def run(self):
        """Wait for a free semaphore slot, then process the stored URI."""
        # The with-block releases the slot on every exit path, including
        # when parseURI raises.
        with threadLimiter:
            parseURI(self.name)
# Serialises updates to the module-level `counter`, which is incremented from
# many worker threads at once.
_counter_lock = threading.Lock()

# Endpoint that receives the INSERT DATA requests.
# NOTE(review): the module-level `sparqlHost` is not used here; this address
# was hard-coded in the original -- confirm which endpoint is intended.
_SPARQL_UPDATE_ENDPOINT = "http://162.243.45.202:8890/sparql"


def _connect():
    """Open a PostgreSQL connection, retrying once if the first attempt fails."""
    dsn = "dbname=postgres user=postgres password=dba host=" + host
    try:
        return psycopg2.connect(dsn)
    except psycopg2.Error:
        return psycopg2.connect(dsn)


def _mark_updated(conn, cur, uri):
    """Set updated = TRUE on the row for *uri*, retrying once after a rollback."""
    statement = "update uri set updated = %s where uri = %s "
    try:
        cur.execute(statement, ('TRUE', uri))
        conn.commit()
    except psycopg2.Error:
        # A failed statement leaves the transaction aborted; without the
        # rollback the retry could never succeed.
        conn.rollback()
        cur.execute(statement, ('TRUE', uri))
        conn.commit()


def _store_triples(graph, graph_uri):
    """Send every triple of *graph* to the SPARQL endpoint, one request each.

    Returns the list of triple objects encoded as UTF-8, in iteration order.
    """
    sparql = SPARQLWrapper(_SPARQL_UPDATE_ENDPOINT)
    sparql.setReturnFormat(JSON)
    sparql.method = 'POST'

    objects = []
    for s, p, o in graph:
        objects.append(o.encode('utf-8'))
        # n3() serialises each term according to its kind (IRI, literal,
        # blank node). Wrapping every object in <...> turned literals into
        # bogus IRIs and let characters in the value break the query.
        sparql.setQuery("""
PREFIX foaf:<http://xmlns.com/foaf/spec/#term_>
INSERT DATA
{
GRAPH <%s>
{
%s %s %s
}
}
""" % (graph_uri, s.n3(), p.n3(), o.n3()))
        sparql.query().convert()
    return objects


def parseURI(uri):
    """Fetch the RDF behind *uri*, store it, and queue the URIs it mentions.

    Steps:

    1. A URI that does not contain "dbpedia" is only marked as updated.
    2. Otherwise the URI is parsed with rdflib; if parsing fails for any
       reason the URI is marked as updated and nothing else happens.
    3. Every triple is inserted into the named graph <uri> on the SPARQL
       endpoint.
    4. Every distinct triple object containing "http://" is inserted into
       the ``uri`` table as a pending row, and a ``myThread`` is started for
       each row that was actually inserted.
    5. The URI itself is marked as updated.

    Errors from the SPARQL endpoint are not caught; they propagate to the
    caller and leave the URI pending. The database connection is closed on
    every exit path.

    Args:
        uri: the resource URI to process.
    """
    global counter

    conn = _connect()
    cur = conn.cursor()
    try:
        if "dbpedia" not in uri:
            _mark_updated(conn, cur, uri)
            return

        g = rdflib.Graph()
        try:
            g.parse(uri)
        except Exception:
            # Best effort: a resource that cannot be fetched or parsed is
            # marked as done so that it is not selected again.
            _mark_updated(conn, cur, uri)
            return

        objects = _store_triples(g, uri)
        print("Graph updated  %s" % uri)

        for candidate in set(objects):
            if "http://" not in candidate:
                continue
            try:
                cur.execute(
                    "INSERT INTO uri (uri, updated_date, updated, graph_updated ) VALUES (%s, %s, %s, %s)",
                    (candidate, today_date, 'FALSE', 'FALSE'))
                conn.commit()
            except psycopg2.Error:
                # Presumably the URI is already in the table. Roll back so
                # the aborted transaction does not make every following
                # INSERT on this connection fail as well.
                conn.rollback()
                continue

            with _counter_lock:
                counter += 1
                queued = counter
            try:
                myThread(candidate).start()
            except Exception:
                # The thread could not be started; the row stays pending
                # (updated = FALSE), so begin() selects it on a later run.
                continue
            print("counter: " + str(queued))

        _mark_updated(conn, cur, uri)
    finally:
        cur.close()
        conn.close()
# Number of rows with updated = 'FALSE' counted by begin(); begin() assigns it
# through a `global` declaration.
maxNum = 0
def begin():
    """Start one worker thread for every URI row still marked as not updated.

    Stores the number of pending rows in the module-level ``maxNum`` and
    prints it, then fetches the pending rows and starts a ``myThread`` for
    each of them. The database connection is closed before any thread is
    started, and also when a query fails.

    Earlier revisions (kept in the original as commented-out queries under
    the labels "Oleg" and "Serik") restricted the selection to ranges of
    ``id``; the active queries take every pending row.
    """
    global maxNum

    conn = psycopg2.connect("dbname=postgres user=postgres password=dba host=" + host)
    try:
        cur = conn.cursor()
        try:
            cur.execute("select count(uri) from uri where updated = 'FALSE';")
            maxNum = cur.fetchone()[0]
            print(str(maxNum) + " max number")

            cur.execute("select * from uri where updated = 'FALSE';")
            rows = cur.fetchall()
        finally:
            cur.close()
        conn.commit()
    finally:
        conn.close()

    for row in rows:
        # Column index 1 is passed on as the URI -- presumably the `uri`
        # column following `id`; confirm against the table definition.
        myThread(row[1]).start()
# Start the crawl as soon as this module is executed (this also happens on import).
begin()