More cleanup
This commit is contained in:
@@ -1,37 +1,34 @@
|
||||
import ipaddress
|
||||
from urllib.parse import urlparse
|
||||
|
||||
import opengraph
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from .urlutils import check_url
|
||||
from .urlutils import is_url_valid
|
||||
from little_boxes.urlutils import check_url
|
||||
from little_boxes.urlutils import is_url_valid
|
||||
|
||||
|
||||
def links_from_note(note):
|
||||
tags_href= set()
|
||||
for t in note.get('tag', []):
|
||||
h = t.get('href')
|
||||
tags_href = set()
|
||||
for t in note.get("tag", []):
|
||||
h = t.get("href")
|
||||
if h:
|
||||
# TODO(tsileo): fetch the URL for Actor profile, type=mention
|
||||
tags_href.add(h)
|
||||
|
||||
links = set()
|
||||
soup = BeautifulSoup(note['content'])
|
||||
for link in soup.find_all('a'):
|
||||
h = link.get('href')
|
||||
if h.startswith('http') and h not in tags_href and is_url_valid(h):
|
||||
soup = BeautifulSoup(note["content"])
|
||||
for link in soup.find_all("a"):
|
||||
h = link.get("href")
|
||||
if h.startswith("http") and h not in tags_href and is_url_valid(h):
|
||||
links.add(h)
|
||||
|
||||
return links
|
||||
|
||||
|
||||
def fetch_og_metadata(user_agent, col, remote_id):
|
||||
doc = col.find_one({'remote_id': remote_id})
|
||||
doc = col.find_one({"remote_id": remote_id})
|
||||
if not doc:
|
||||
raise ValueError
|
||||
note = doc['activity']['object']
|
||||
note = doc["activity"]["object"]
|
||||
print(note)
|
||||
links = links_from_note(note)
|
||||
if not links:
|
||||
@@ -40,9 +37,11 @@ def fetch_og_metadata(user_agent, col, remote_id):
|
||||
htmls = []
|
||||
for l in links:
|
||||
check_url(l)
|
||||
r = requests.get(l, headers={'User-Agent': user_agent})
|
||||
r = requests.get(l, headers={"User-Agent": user_agent})
|
||||
r.raise_for_status()
|
||||
htmls.append(r.text)
|
||||
links_og_metadata = [dict(opengraph.OpenGraph(html=html)) for html in htmls]
|
||||
col.update_one({'remote_id': remote_id}, {'$set': {'meta.og_metadata': links_og_metadata}})
|
||||
col.update_one(
|
||||
{"remote_id": remote_id}, {"$set": {"meta.og_metadata": links_og_metadata}}
|
||||
)
|
||||
return len(links)
|
||||
|
Reference in New Issue
Block a user