Implement processing links with href or src attributes.

3 years ago · 517ab557db
parent 29fceab613
commit 517ab557db
3 changed files with 19 additions and 19 deletions
--- a/App.py
+++ b/App.py
@@ -220,7 +220,7 @@ class Links(urwid.ListBox):

    def parseLink(self, link):
        ext = link.split(".")[-1]
-        if ext.lower() in ("jpg", "jpeg", "gif", "png", "tif", "tiff"):
+        if Utils.checkPic(ext.lower()):
            os.system('nohup feh ' + link + ' </dev/null >/dev/null 2>&1 &')
        elif Utils.checkStreamingVideo(link):
            tui.destroyOverlay()
--- a/Render.py
+++ b/Render.py
@@ -1,30 +1,22 @@
 from inscriptis import get_text
 import os
 import Utils
-import html.parser
+from bs4 import BeautifulSoup
 from RedditCommentsParser import RedditComments


-class LinkParser(html.parser.HTMLParser):
-    def reset(self):
-        super().reset()
-        self.links = set()
-
-    def handle_starttag(self, tag, attrs):
-        if tag == 'a':
-            for (name, value) in attrs:
-                if name == 'href':
-                    self.links.add(value)
-
-
 class Article:
    def __init__(self, articleObj):
-        Utils.writeLog(articleObj)
        content = articleObj["summary"]["content"]
-        parser = LinkParser()
-        for line in content:
-            parser.feed(line)
-        self.links = list(parser.links)
+        soup = BeautifulSoup(content)
+        links = soup.find_all(href=True)
+        media = soup.find_all(src=True)
+        links_set = set()
+        for link in links:
+            links_set.add(link['href'])
+        for m in media:
+            links_set.add(m['src'])
+        self.links = list(links_set)
        self.text = get_text(content)
        self.title = articleObj["title"]
        self.date = Utils.timestampToDate(articleObj["timestampUsec"])
--- a/Utils.py
+++ b/Utils.py
@@ -2,12 +2,20 @@ from datetime import datetime
 import re

 streaming_urls = ["^https://www.youtube.com", "^https://player.odycdn.com", "^https://youtu.be"]
+pics = ["^jpg\\?*", "^jpeg\\?*", "^gif\\?*", "^png\\?*", "^tif\\?*", "^tiff\\?*"]


 def timestampToDate(ts):
    return datetime.fromtimestamp(int(ts)/1000000).strftime("%y-%m-%d %H:%M")


+def checkPic(ext):
+    """Return True when ext matches at least one regex listed in pics."""
+    hits = (re.search(pattern, ext) for pattern in pics)
+    # any() stops at the first match, like the early return of a loop.
+    return any(hit is not None for hit in hits)
+
+
 def checkStreamingVideo(link):
    for pattern in streaming_urls:
        if re.search(pattern, link) is not None: