|
|
|
@ -1,30 +1,22 @@
|
|
|
|
from inscriptis import get_text
|
|
|
|
from inscriptis import get_text
|
|
|
|
import os
|
|
|
|
import os
|
|
|
|
import Utils
|
|
|
|
import Utils
|
|
|
|
import html.parser
|
|
|
|
from bs4 import BeautifulSoup
|
|
|
|
from RedditCommentsParser import RedditComments
|
|
|
|
from RedditCommentsParser import RedditComments
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class LinkParser(html.parser.HTMLParser):
    """HTML parser that accumulates the href target of every anchor tag fed to it.

    After feeding markup via ``feed()``, the collected targets are available
    in the ``links`` set. Calling ``reset()`` clears them.
    """

    def reset(self):
        # HTMLParser.__init__ invokes reset(), so ``links`` is guaranteed to
        # exist on every instance without defining __init__ here.
        super().reset()
        self.links = set()

    def handle_starttag(self, tag, attrs):
        # Only anchor tags contribute; everything else is ignored.
        if tag != 'a':
            return
        for attr_name, attr_value in attrs:
            if attr_name == 'href':
                self.links.add(attr_value)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class Article:
    """Normalized view of one feed article: extracted links, plain text, title, date.

    Defect fixed: this span was unreconciled diff residue interleaving two
    implementations (an ``html.parser``-based link scan that fed ``content``
    one character at a time, and a BeautifulSoup-based scan), leaving the
    class syntactically incoherent. This is the coherent BeautifulSoup
    version, with an explicit parser name passed to ``BeautifulSoup`` so
    bs4 does not guess (and warn about) the markup parser.
    """

    def __init__(self, articleObj):
        # articleObj is presumably a Google-Reader-style item dict with
        # "summary"->"content", "title", and "timestampUsec" keys —
        # TODO confirm against the caller that builds it.
        Utils.writeLog(articleObj)
        content = articleObj["summary"]["content"]

        # Parse the article HTML once with the stdlib parser backend.
        soup = BeautifulSoup(content, "html.parser")

        # Collect every link target (any tag with href=) and media source
        # (any tag with src=), de-duplicated via a set.
        link_targets = set()
        for tag in soup.find_all(href=True):
            link_targets.add(tag['href'])
        for tag in soup.find_all(src=True):
            link_targets.add(tag['src'])
        self.links = list(link_targets)

        # Plain-text rendering of the HTML body.
        self.text = get_text(content)
        self.title = articleObj["title"]
        # timestampUsec is converted to a date by the project helper.
        self.date = Utils.timestampToDate(articleObj["timestampUsec"])
|
|
|
|
|