#!/usr/bin/python
# coding: utf-8
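"""Collect newspaper articles for a left/right political-leaning corpus.

The script walks a list of left-leaning and then right-leaning news sites,
downloads the linked articles, keeps those that contain one of the target
key phrases, and stops each pass once roughly 500000 tokens have been saved
under result_left/ and result_right/ respectively.
"""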
import os
import codecs
import re

import requests
import lxml.etree
from requests.exceptions import RequestException

from functions import get_keywords_list, found_kw
from sites_left import (courant, chicagotribune, washingtonpost, baltimoresun,
                        nytimes, chroncom, dallasmorningnews, denverpost,
                        nydailynews, newsday, latimes, sanfranciscochronicle,
                        detroitfreepress)
from sites_right import (wsj, columbusdispatch, nypost, thewashingtontimes,
                         newsmax, nationalreview, outsidethebeltway,
                         weeklystandard, americanthinker, dailycaller,
                         voiceofamerica, cityjournal)
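
# Each site module imported above is expected to expose two callables; this is
# inferred from how parse_site() uses them below, not from the modules' own
# documentation:
#   links()      -- returns an iterable of article URLs to crawl for that site
#   parse(html)  -- extracts the article body from a page's HTML as unicode,
#                   raising lxml.etree.ParserError/XMLSyntaxError or ValueError
#                   when the page cannot be parsed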


class FoundEnough(Exception):
    """Raised once the 500000-token target for the current side is reached."""
    pass


def get_content(link):
    # Fetch the page at `link`; return its text, or None on any request failure.
    try:
        req = requests.get(link)
        text = req.text
        req.close()
        return text
    except RequestException:
        return None
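
# Note: requests' ConnectionError, HTTPError, URLRequired, TooManyRedirects and
# Timeout all derive from requests.exceptions.RequestException, so the single
# except clause above covers the full list the script originally enumerated.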


def count_tokens(article):
    # A "token" is a run of Latin letters and apostrophes, e.g. "don't".
    return len(re.findall(r"[a-zA-Z']+", article))
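
# For example, count_tokens(u"Don't stop me now") == 4.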


def parse_site(name, links_function, parse_function):
    global token_count, result_dir, cache
    if not os.path.exists(os.path.join(result_dir, name)):
        os.makedirs(os.path.join(result_dir, name))
    # Continue numbering saved articles after any that already exist for this site.
    i = len(os.listdir(os.path.join(result_dir, name))) + 1
    for link in links_function():
        if token_count > 500000:
            print "Found 500000 tokens, stopping."
            raise FoundEnough
        print u"Parsing article %s" % link,
        if link in cache:
            print u"(already cached, skipping)"
            continue
        article_html = get_content(link)
        if article_html is None:
            # get_content() already swallowed the request error.
            print u"(download failed)"
            continue
        try:
            article = parse_function(article_html)
            print u"(article length: %s characters)" % len(article)
        except (lxml.etree.ParserError, lxml.etree.XMLSyntaxError):
            print u"parse error"
            continue
        except ValueError:
            print u"value error"
            continue
        # Save the article as soon as any keyword is found in it.
        for kw in keywords:
            if found_kw(kw, article):
                print u" Article %s contains the phrase \"%s\"" % (link, kw)
                f = codecs.open(os.path.join(result_dir, name, "%i.txt" % i), 'w', 'utf-8')
                f.write(article)
                f.close()
                token_count += count_tokens(article)
                i += 1
                break
        cache.append(link)
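
# The helpers imported from `functions` are used only through the calls above
# and below; their actual implementation lives in functions.py. Based on that
# usage (an assumption, not the module's documented API):
#   get_keywords_list(side)  -- returns the list of key phrases for 'left' or 'right'
#   found_kw(kw, article)    -- returns a truthy value if the phrase `kw` occurs in `article`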

try:
    # First pass: left-leaning outlets.
    token_count = 0
    cache = []
    keywords = get_keywords_list('left')
    result_dir = 'result_left'
    current = 'left'
    try:
        parse_site('courant', courant.links, courant.parse)
        parse_site('chicagotribune', chicagotribune.links, chicagotribune.parse)
        parse_site('washingtonpost', washingtonpost.links, washingtonpost.parse)
        parse_site('baltimoresun', baltimoresun.links, baltimoresun.parse)
        parse_site('nytimes', nytimes.links, nytimes.parse)
        parse_site('chroncom', chroncom.links, chroncom.parse)
        parse_site('dallasmorningnews', dallasmorningnews.links, dallasmorningnews.parse)
        parse_site('denverpost', denverpost.links, denverpost.parse)
        parse_site('nydailynews', nydailynews.links, nydailynews.parse)
        parse_site('newsday', newsday.links, newsday.parse)
        parse_site('latimes', latimes.links, latimes.parse)
        parse_site('sanfranciscochronicle', sanfranciscochronicle.links, sanfranciscochronicle.parse)
        parse_site('detroitfreepress', detroitfreepress.links, detroitfreepress.parse)
    except FoundEnough:
        print "Collected 500000 tokens from \"left\" articles, moving on to the right-leaning sites"

    # Second pass: right-leaning outlets.
    token_count = 0
    cache = []
    keywords = get_keywords_list('right')
    result_dir = 'result_right'
    current = 'right'
    try:
        parse_site('wsj', wsj.links, wsj.parse)
        parse_site('columbusdispatch', columbusdispatch.links, columbusdispatch.parse)
        parse_site('nypost', nypost.links, nypost.parse)
        parse_site('thewashingtontimes', thewashingtontimes.links, thewashingtontimes.parse)
        parse_site('newsmax', newsmax.links, newsmax.parse)
        parse_site('nationalreview', nationalreview.links, nationalreview.parse)
        parse_site('outsidethebeltway', outsidethebeltway.links, outsidethebeltway.parse)
        parse_site('weeklystandard', weeklystandard.links, weeklystandard.parse)
        parse_site('americanthinker', americanthinker.links, americanthinker.parse)
        parse_site('dailycaller', dailycaller.links, dailycaller.parse)
        parse_site('voiceofamerica', voiceofamerica.links, voiceofamerica.parse)
        parse_site('cityjournal', cityjournal.links, cityjournal.parse)
    except FoundEnough:
        print "Collected 500000 tokens from \"right\" articles, finishing"
except KeyboardInterrupt:
    pass
finally:
    print
    print "Collected %s tokens" % token_count