
#!/usr/bin/python
# coding: utf-8
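#
# Collects a corpus of US news articles that mention a given set of keyword phrases:
# first from a list of left-leaning outlets, then from a list of right-leaning ones,
# stopping each pass once roughly 500,000 tokens have been saved to result_left/ or
# result_right/. The keyword lists and the per-site scrapers live in the project
# modules `functions`, `sites_left` and `sites_right` (not shown here); each site
# module is expected to expose a `links()` callable that yields article URLs and a
# `parse(html)` callable that returns the article text.
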
import os
import codecs
import re

import requests
import lxml.etree
from requests.exceptions import RequestException

from functions import get_keywords_list, found_kw
from sites_left import courant, chicagotribune, washingtonpost, baltimoresun, nytimes, chroncom, dallasmorningnews, denverpost, nydailynews, newsday, latimes, sanfranciscochronicle, detroitfreepress
from sites_right import wsj, columbusdispatch, nypost, thewashingtontimes, newsmax, nationalreview, outsidethebeltway, weeklystandard, americanthinker, dailycaller, voiceofamerica, cityjournal


# Raised to stop the crawl once the 500,000-token quota for the current side is reached.
class FoundEnough(Exception):
    pass


def get_content(link):
    # Download a page; on any requests failure return None so the caller skips the link.
    try:
        req = requests.get(link)
        text = req.text
        req.close()
        return text
    except RequestException:
        # Covers ConnectionError, HTTPError, Timeout, TooManyRedirects, etc.
        return None


def count_tokens(article):
    # A "token" is a run of Latin letters/apostrophes,
    # e.g. count_tokens(u"Don't stop me now") == 4.
    return len(re.findall(r"[a-zA-Z']+", article))
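

# Crawl a single site: fetch every link the site module reports, extract the article
# text, and save it under result_dir/<name>/ if it contains at least one keyword phrase.
# Saved articles are added to the running token count; FoundEnough is raised once the
# 500,000-token quota is hit. Already-processed links are remembered in `cache`.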
def parse_site(name, links_function, parse_function):
    global token_count, result_dir, cache
    if not os.path.exists(os.path.join(result_dir, name)):
        os.mkdir(os.path.join(result_dir, name))
    # Continue numbering after any files saved on a previous run.
    i = len(os.listdir(os.path.join(result_dir, name))) + 1
    for link in links_function():
        if token_count > 500000:
            print "500,000 tokens collected, stopping."
            raise FoundEnough
        print u"Parsing article %s" % link,
        if link in cache:
            print u"(already in the cache, skipping)"
            continue
        article_html = get_content(link)
        if article_html is None:
            print u"HTTP error"
            continue
        try:
            article = parse_function(article_html)
            print u"(article length: %s characters)" % len(article)
        except (lxml.etree.ParserError, lxml.etree.XMLSyntaxError):
            print u"parse error"
            continue
        except ValueError:
            print u"value error"
            continue
        # Save the article as soon as any keyword phrase matches.
        for kw in keywords:
            if found_kw(kw, article):
                print u"  Article %s contains the phrase \"%s\"" % (link, kw)
                f = codecs.open(os.path.join(result_dir, name, "%i.txt" % i), 'w', 'utf-8')
                f.write(article)
                f.close()
                token_count += count_tokens(article)
                i += 1
                break
        cache.append(link)
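

# Driver: collect roughly 500,000 tokens of matching articles from the left-leaning
# sites, then the same amount from the right-leaning ones. Ctrl-C aborts early, but
# the token total for the side being processed is still reported.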
try:
    token_count = 0
    cache = []
    keywords = get_keywords_list('left')
    result_dir = 'result_left'
    current = 'left'
    try:
        parse_site('courant', courant.links, courant.parse)
        parse_site('chicagotribune', chicagotribune.links, chicagotribune.parse)
        parse_site('washingtonpost', washingtonpost.links, washingtonpost.parse)
        parse_site('baltimoresun', baltimoresun.links, baltimoresun.parse)
        parse_site('nytimes', nytimes.links, nytimes.parse)
        parse_site('chroncom', chroncom.links, chroncom.parse)
        parse_site('dallasmorningnews', dallasmorningnews.links, dallasmorningnews.parse)
        parse_site('denverpost', denverpost.links, denverpost.parse)
        parse_site('nydailynews', nydailynews.links, nydailynews.parse)
        parse_site('newsday', newsday.links, newsday.parse)
        parse_site('latimes', latimes.links, latimes.parse)
        parse_site('sanfranciscochronicle', sanfranciscochronicle.links, sanfranciscochronicle.parse)
        parse_site('detroitfreepress', detroitfreepress.links, detroitfreepress.parse)
    except FoundEnough:
        print "500,000 tokens of \"left\" articles collected, moving on to the right-leaning sites"

    token_count = 0
    cache = []
    keywords = get_keywords_list('right')
    result_dir = 'result_right'
    current = 'right'
    try:
        parse_site('wsj', wsj.links, wsj.parse)
        parse_site('columbusdispatch', columbusdispatch.links, columbusdispatch.parse)
        parse_site('nypost', nypost.links, nypost.parse)
        parse_site('thewashingtontimes', thewashingtontimes.links, thewashingtontimes.parse)
        parse_site('newsmax', newsmax.links, newsmax.parse)
        parse_site('nationalreview', nationalreview.links, nationalreview.parse)
        parse_site('outsidethebeltway', outsidethebeltway.links, outsidethebeltway.parse)
        parse_site('weeklystandard', weeklystandard.links, weeklystandard.parse)
        parse_site('americanthinker', americanthinker.links, americanthinker.parse)
        parse_site('dailycaller', dailycaller.links, dailycaller.parse)
        parse_site('voiceofamerica', voiceofamerica.links, voiceofamerica.parse)
        parse_site('cityjournal', cityjournal.links, cityjournal.parse)
    except FoundEnough:
        print "500,000 tokens of \"right\" articles collected, finishing"
except KeyboardInterrupt:
    pass
finally:
    print
    print "Collected %s tokens" % token_count