"""
Functions for easy parsing RSS and ATOM feeds.
"""
import hashlib
import logging
import re
import sha
from datetime import datetime
from time import mktime

import feedparser

import clean
def parse_time(time):
    """
    Convert a ``struct_time``-style tuple into a local ``datetime``.
    """
    timestamp = mktime(time)
    return datetime.fromtimestamp(timestamp)
def parse_modified_date(entry):
    """
    Determine the modification time of a feed entry.

    Uses the entry's pre-parsed ``modified_parsed`` time tuple when it is
    present and non-empty, falling back to the current time otherwise.

    Returns:
        datetime of last modification, or ``datetime.now()`` when unknown.
    """
    # NOTE: the previous version also fed the raw ``entry.modified`` string
    # into mktime(), which accepts only a time tuple and therefore raised
    # TypeError whenever that branch ran.  Only the pre-parsed form is
    # usable here; anything else falls back to "now".
    modified = getattr(entry, 'modified_parsed', None)
    if modified:
        return datetime.fromtimestamp(mktime(modified))
    return datetime.now()
def get_tags(entry):
    """
    Extract the set of tag names (strings) from a feed entry.

    Prefers a tag's human-readable ``label`` over its machine ``term``.
    Each value is split on both ',' and '/', surrounding whitespace is
    stripped, and empty fragments are dropped.

    Returns:
        set of non-empty tag strings; empty set when the entry has no tags.
    """
    # The old docstring claimed "a list of tag objects"; the function has
    # always returned a set of plain strings.
    tags = set()
    if 'tags' in entry:
        for tag in entry.tags:
            if getattr(tag, 'label', None):
                term = tag.label
            else:
                term = getattr(tag, 'term', '')
            # Normalize both separators to '/' so one split handles both.
            terms = term.strip().replace(',', '/').split('/')
            tags.update(x.strip() for x in terms if x.strip())
    return tags
def parse_feed(url=None, source_data=None, summary_size=1000, etag=None):
    """
    Parse a feed from a URL or from raw source data.

    Args:
        url: address of the feed to fetch (takes priority over source_data).
        source_data: raw feed markup, used when no url is given.
        summary_size: maximum number of characters kept in each entry summary.
        etag: previously seen HTTP ETag; a 304 response then means
            "not modified" and an empty entry list is returned.

    Returns:
        Dict with keys:
            'feed'    - the feedparser result (None on failure),
            'success' - True when parsing did not raise,
            'entries' - list of dicts with title, link, summary, content,
                        time_created, guid and tags.

    Raises:
        Exception: when neither url nor source_data is supplied.
    """
    if not url and not source_data:
        raise Exception('parse_feed requires url or source_data argument')
    resp = {'feed': None, 'success': False, 'entries': []}
    try:
        resp['feed'] = feedparser.parse(url or source_data)
    except Exception as ex:
        # feedparser is normally resilient; record the failure instead of
        # silently swallowing it (the old code did `except: pass`).
        logging.debug('Feed parsing failed: %s', ex)
    else:
        resp['success'] = True
    if not resp['success']:
        # resp['feed'] may be None here; bail out before touching it.
        return resp
    if url:
        # HTTP-level checks only make sense when the feed was fetched.
        if hasattr(resp['feed'], 'status'):
            if etag and 304 == resp['feed'].status:
                logging.debug('Feed has not been changed since last check')
                return resp
            if resp['feed'].status >= 400:
                # 400 itself is an error too; the old `400 <` comparison
                # let "400 Bad Request" responses through.
                return resp
        if not resp['feed'].get('etag'):
            resp['feed'].etag = ''
    resp['feed'].last_checked = datetime.now()
    for entry in resp['feed'].entries:
        title = getattr(entry, 'title', 'untitled')
        link = getattr(entry, 'link', '')
        # Prefer full content, then summary, then description; entries
        # without any body text are skipped entirely.
        if hasattr(entry, 'content'):
            content = entry.content[0].value
        elif hasattr(entry, 'summary'):
            content = entry.summary
        elif hasattr(entry, 'description'):
            content = entry.description
        else:
            continue
        summary = clean.safe_html(content[:summary_size])
        content = clean.safe_html(content)
        time_created = parse_modified_date(entry)
        tags = get_tags(entry)
        # SHA-1 of the link is the stable unique id; hashlib.sha1 replaces
        # the long-deprecated sha module and yields an identical digest.
        guid = hashlib.sha1(link.encode('utf-8')).hexdigest()
        resp['entries'].append({
            'title': title,
            'link': link,
            'summary': summary,
            'content': content,
            'time_created': time_created,
            'guid': guid,
            'tags': tags,
        })
    return resp