import re
import logging
from pprint import pprint

from grab import Grab
from grab.spider import Spider, Task
from grab.spider.decorators import integrity
from weblib.error import DataNotValid, DataNotFound
from weblib.etree import render_node
from weblib.rex import rex_text

from project.database import db


class DoiNotFound(DataNotFound):
    """Raised when an article page carries no DOI block."""


class JstageSpider(Spider):
    def parse_doi(self, elem):
        try:
            data = elem.select('.//p[@class="doi"]').text()
        except IndexError:
            # An empty selection raises weblib's DataNotFound,
            # which derives from IndexError
            raise DoiNotFound('DOI not found')
        else:
            # Keep only the bare DOI, i.e. the part after "doi.org/"
            return data.split('.org/')[1]

    def parse_details(self, elem):
        # The issue heading looks like "Vol. N (YYYY) No. M p. X-Y";
        # pull volume, year, number and page range out of it
        result = {
            'volume': int(elem.select('.//h1/a[1]').text()
                          .replace('Vol. ', '').strip()),
            'year': int(elem.select(
                './/h1/a[1]/following-sibling::text()').text().strip('()')),
            'number': elem.select('.//h1/a[2]').text()
                      .replace('No. ', '').strip(),
            'pages': elem.select(
                './/h1/a[2]/following-sibling::text()'
                '[translate(normalize-space(.), " ", "") != ""]'
            ).text(default='').replace('p. ', '').replace('P ', '').strip(),
        }
        return result

    def normalize_url(self, grab, url):
        if url.startswith('//'):
            url = 'http:' + url
        if url.startswith('/'):
            url = grab.make_url_absolute(url)
        return url

    def parse_title(self, grab, elem):
        node = elem.select('.//h2').node(default=None)
        if node is None:
            return None
        # Make every src/href inside the title markup absolute
        for sub in node.xpath('.//*'):
            if 'src' in sub.attrib:
                sub.attrib['src'] = self.normalize_url(grab, sub.attrib['src'])
            if 'href' in sub.attrib:
                sub.attrib['href'] = self.normalize_url(grab, sub.attrib['href'])
        # Render the node back to HTML and strip the enclosing <h2> tags
        data = render_node(node).strip()
        data = re.sub('^<h2[^>]*>', '', data)
        data = re.sub('</h2>$', '', data)
        return data.strip()

    def integrity_basic(self, grab):
        if not grab.doc('//a[text()="About My J-STAGE"]').exists():
            raise DataNotValid('No about-my-j-stage link')

    def integrity_article_page(self, grab):
        if not grab.doc('//div[@class="mod-page-heading-container"]').exists():
            raise DataNotValid('No mod-page-heading-container block')

    @integrity('integrity_basic')
    def task_home(self, grab, task):
        # Each leaf of the issue tree keeps its real URL inside
        # the onclick attribute
        for elem in grab.doc('//div[@id="tree"]/ul/li/ul/li/a'):
            url = elem.attr('onclick')
            url = url.split("href='")[1].split("'")[0]
            url = grab.make_url_absolute(url)
            yield Task('number', url=url, page=1)

    @integrity('integrity_basic')
    def task_number(self, grab, task):
        self.stat.inc('number')
        for elem in grab.doc('//h3[@class="mod-item-heading"]/a'):
            url = grab.make_url_absolute(elem.attr('href'))
            # The nearest preceding <h2> names the section (document type)
            # the article is listed under
            doc_type = elem.select('./../../../../preceding-sibling::h2').text()
            yield Task('issue', url=url, ref=task.url, doc_type=doc_type)
        # Remember the pagination links seen on this page
        jumps = []
        for elem in grab.doc('//li[@class="jump"]/a'):
            jumps.append(elem.text())
        db.jump.save({
            '_id': task.url,
            'jumps': jumps,
            'page': task.get('page', 1),
        })
        # Follow the ">" link to the next page of the listing
        self.process_next_page(
            grab, task, '//li[@class="jump"]/a[text()=">"]/@href')

    @integrity('integrity_basic')
    @integrity('integrity_article_page')
    def task_issue(self, grab, task):
        try:
            head_elem = grab.doc(
                '//div[@class="mod-page-heading-container"]').one()
            body_elem = grab.doc('//div[@class="mod-article-info"]').one()
            doi = self.parse_doi(head_elem)
            issue = {
                '_id': 'jstage-' + doi,
                'doi': doi,
                'title': self.parse_title(grab, body_elem),
                'source': 'jstage',
                'url': task.url,
                'doc_type': task.doc_type,
            }
            issue.update(self.parse_details(head_elem))
        except DoiNotFound:
            # Record pages without a DOI so they can be inspected later
            self.stat.inc('doi-not-found')
            self.stat.inc('doi-not-found-doc-type-%s' % task.doc_type)
            db.doi_not_found.save({
                '_id': task.url,
                'doc_type': task.doc_type,
                'ref': task.ref,
            })
            print('DOI NOT FOUND')
        except Exception as ex:
            # Debugging hooks, left disabled:
            # logging.error('', exc_info=ex)
            # import pdb; pdb.set_trace()
            raise
        else:
            self.stat.inc('issue')
            db.issue.save(issue)
            # pprint(issue)

    def task_generator(self):
        yield Task('home', url='https://www.jstage.jst.go.jp/browse/cpb')

    def update_grab_instance(self, grab):
        # Apply generous timeouts to every request the spider makes
        grab.setup(timeout=120, connect_timeout=60)

    def prepare(self):
        # Start each crawl from a clean slate
        db.issue.remove({'source': 'jstage'})
        db.jump.drop()
        db.doi_not_found.drop()
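

# A minimal sketch of how the spider might be launched; this runner block is
# not part of the original listing, and thread_number=2 is an arbitrary
# choice (thread_number is a standard grab.spider.Spider option).
if __name__ == '__main__':
    logging.basicConfig(level=logging.DEBUG)
    bot = JstageSpider(thread_number=2)
    bot.run()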