More cleanup

Thomas Sileo, 2018-06-17 19:21:59 +02:00
commit 6220064951 (parent 4e669620bc)
16 changed files with 420 additions and 789 deletions

View File

@@ -4,9 +4,9 @@ logger = logging.getLogger(__name__)
def strtobool(s: str) -> bool:
if s in ['y', 'yes', 'true', 'on', '1']:
if s in ["y", "yes", "true", "on", "1"]:
return True
if s in ['n', 'no', 'false', 'off', '0']:
if s in ["n", "no", "false", "off", "0"]:
return False
raise ValueError(f'cannot convert {s} to bool')
raise ValueError(f"cannot convert {s} to bool")

View File

@@ -1,68 +0,0 @@
from typing import Any
from typing import Dict
from typing import List
from typing import Optional
import requests
from .errors import RecursionLimitExceededError
from .errors import UnexpectedActivityTypeError
def _do_req(url: str, headers: Dict[str, str]) -> Dict[str, Any]:
resp = requests.get(url, headers=headers)
resp.raise_for_status()
return resp.json()
def parse_collection(
payload: Optional[Dict[str, Any]] = None,
url: Optional[str] = None,
user_agent: Optional[str] = None,
level: int = 0,
do_req: Any = _do_req,
) -> List[str]:
"""Resolve/fetch a `Collection`/`OrderedCollection`."""
if level > 3:
raise RecursionLimitExceededError('recursion limit exceeded')
# Go through all the pages
headers = {'Accept': 'application/activity+json'}
if user_agent:
headers['User-Agent'] = user_agent
out: List[str] = []
if url:
payload = do_req(url, headers)
if not payload:
raise ValueError('must provide at least a payload or a URL')
if payload['type'] in ['Collection', 'OrderedCollection']:
if 'orderedItems' in payload:
return payload['orderedItems']
if 'items' in payload:
return payload['items']
if 'first' in payload:
if 'orderedItems' in payload['first']:
out.extend(payload['first']['orderedItems'])
if 'items' in payload['first']:
out.extend(payload['first']['items'])
n = payload['first'].get('next')
if n:
out.extend(parse_collection(url=n, user_agent=user_agent, level=level+1, do_req=do_req))
return out
while payload:
if payload['type'] in ['CollectionPage', 'OrderedCollectionPage']:
if 'orderedItems' in payload:
out.extend(payload['orderedItems'])
if 'items' in payload:
out.extend(payload['items'])
n = payload.get('next')
if n is None:
break
payload = do_req(n, headers)
else:
raise UnexpectedActivityTypeError('unexpected activity type {}'.format(payload['type']))
return out
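
This removed helper walked Collection/OrderedCollection pages recursively (capped at three levels) and flattened the item IDs into a list. The injectable do_req parameter makes it testable without HTTP; a minimal sketch using a hypothetical stub (URLs and payloads are illustrative):

def fake_do_req(url, headers):
    # Stand-in for the HTTP layer: returns canned ActivityPub pages.
    pages = {
        "https://example.com/followers": {
            "type": "OrderedCollection",
            "first": {
                "orderedItems": ["https://example.com/users/a"],
                "next": None,
            },
        },
    }
    return pages[url]

items = parse_collection(url="https://example.com/followers", do_req=fake_do_req)
# items == ["https://example.com/users/a"]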

View File

@@ -17,7 +17,7 @@ class NotAnActorError(Exception):
class ActorService(object):
def __init__(self, user_agent, col, actor_id, actor_data, instances):
logger.debug(f'Initializing ActorService user_agent={user_agent}')
logger.debug(f"Initializing ActorService user_agent={user_agent}")
self._user_agent = user_agent
self._col = col
self._in_mem = {actor_id: actor_data}
@@ -25,57 +25,70 @@ class ActorService(object):
self._known_instances = set()
def _fetch(self, actor_url):
logger.debug(f'fetching remote object {actor_url}')
logger.debug(f"fetching remote object {actor_url}")
check_url(actor_url)
resp = requests.get(actor_url, headers={
'Accept': 'application/activity+json',
'User-Agent': self._user_agent,
})
resp = requests.get(
actor_url,
headers={
"Accept": "application/activity+json",
"User-Agent": self._user_agent,
},
)
if resp.status_code == 404:
raise ActivityNotFoundError(f'{actor_url} cannot be fetched, 404 not found error')
raise ActivityNotFoundError(
f"{actor_url} cannot be fetched, 404 not found error"
)
resp.raise_for_status()
return resp.json()
def get(self, actor_url, reload_cache=False):
logger.info(f'get actor {actor_url} (reload_cache={reload_cache})')
logger.info(f"get actor {actor_url} (reload_cache={reload_cache})")
if actor_url in self._in_mem:
return self._in_mem[actor_url]
instance = urlparse(actor_url)._replace(path='', query='', fragment='').geturl()
instance = urlparse(actor_url)._replace(path="", query="", fragment="").geturl()
if instance not in self._known_instances:
self._known_instances.add(instance)
if not self._instances.find_one({'instance': instance}):
self._instances.insert({'instance': instance, 'first_object': actor_url})
if not self._instances.find_one({"instance": instance}):
self._instances.insert(
{"instance": instance, "first_object": actor_url}
)
if reload_cache:
actor = self._fetch(actor_url)
self._in_mem[actor_url] = actor
self._col.update({'actor_id': actor_url}, {'$set': {'cached_response': actor}}, upsert=True)
self._col.update(
{"actor_id": actor_url},
{"$set": {"cached_response": actor}},
upsert=True,
)
return actor
cached_actor = self._col.find_one({'actor_id': actor_url})
cached_actor = self._col.find_one({"actor_id": actor_url})
if cached_actor:
return cached_actor['cached_response']
return cached_actor["cached_response"]
actor = self._fetch(actor_url)
if not 'type' in actor:
if not "type" in actor:
raise NotAnActorError(None)
if actor['type'] != 'Person':
if actor["type"] != "Person":
raise NotAnActorError(actor)
self._col.update({'actor_id': actor_url}, {'$set': {'cached_response': actor}}, upsert=True)
self._col.update(
{"actor_id": actor_url}, {"$set": {"cached_response": actor}}, upsert=True
)
self._in_mem[actor_url] = actor
return actor
def get_public_key(self, actor_url, reload_cache=False):
profile = self.get(actor_url, reload_cache=reload_cache)
pub = profile['publicKey']
return pub['id'], RSA.importKey(pub['publicKeyPem'])
pub = profile["publicKey"]
return pub["id"], RSA.importKey(pub["publicKeyPem"])
def get_inbox_url(self, actor_url, reload_cache=False):
profile = self.get(actor_url, reload_cache=reload_cache)
return profile.get('inbox')
return profile.get("inbox")

View File

@@ -1,65 +0,0 @@
import re
import typing
from typing import Any
from typing import Dict
from typing import List
from typing import Optional
from typing import Tuple
from typing import Type
from typing import Union
from bleach.linkifier import Linker
from markdown import markdown
from config import ACTOR_SERVICE
from config import BASE_URL
from config import ID
from config import USERNAME
from utils.webfinger import get_actor_url
def set_attrs(attrs, new=False):
attrs[(None, u'target')] = u'_blank'
attrs[(None, u'class')] = u'external'
attrs[(None, u'rel')] = u'noopener'
attrs[(None, u'title')] = attrs[(None, u'href')]
return attrs
LINKER = Linker(callbacks=[set_attrs])
HASHTAG_REGEX = re.compile(r"(#[\d\w\.]+)")
MENTION_REGEX = re.compile(r"@[\d\w_.+-]+@[\d\w-]+\.[\d\w\-.]+")
def hashtagify(content: str) -> Tuple[str, List[Dict[str, str]]]:
tags = []
for hashtag in re.findall(HASHTAG_REGEX, content):
tag = hashtag[1:]
link = f'<a href="{BASE_URL}/tags/{tag}" class="mention hashtag" rel="tag">#<span>{tag}</span></a>'
tags.append(dict(href=f'{BASE_URL}/tags/{tag}', name=hashtag, type='Hashtag'))
content = content.replace(hashtag, link)
return content, tags
def mentionify(content: str) -> Tuple[str, List[Dict[str, str]]]:
tags = []
for mention in re.findall(MENTION_REGEX, content):
_, username, domain = mention.split('@')
actor_url = get_actor_url(mention)
p = ACTOR_SERVICE.get(actor_url)
print(p)
tags.append(dict(type='Mention', href=p['id'], name=mention))
link = f'<span class="h-card"><a href="{p["url"]}" class="u-url mention">@<span>{username}</span></a></span>'
content = content.replace(mention, link)
return content, tags
def parse_markdown(content: str) -> Tuple[str, List[Dict[str, str]]]:
tags = []
content = LINKER.linkify(content)
content, hashtag_tags = hashtagify(content)
tags.extend(hashtag_tags)
content, mention_tags = mentionify(content)
tags.extend(mention_tags)
content = markdown(content)
return content, tags
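
These removed helpers turned hashtags and @user@domain mentions into ActivityPub tag entries plus HTML links, then ran the result through Markdown. A sketch of the hashtag pass alone (assuming BASE_URL is "https://example.com"; mentionify additionally resolves each actor over WebFinger, so it needs the network):

content, tags = hashtagify("Hello #fediverse")
# content: 'Hello <a href="https://example.com/tags/fediverse" class="mention hashtag"
#           rel="tag">#<span>fediverse</span></a>'
# tags: [{"href": "https://example.com/tags/fediverse", "name": "#fediverse",
#         "type": "Hashtag"}]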

View File

@@ -1,37 +0,0 @@
class Error(Exception):
status_code = 400
def __init__(self, message, status_code=None, payload=None):
Exception.__init__(self)
self.message = message
if status_code is not None:
self.status_code = status_code
self.payload = payload
def to_dict(self):
rv = dict(self.payload or ())
rv['message'] = self.message
return rv
def __repr__(self):
return f'{self.__class__.__qualname__}({self.message!r}, payload={self.payload!r}, status_code={self.status_code})'
class NotFromOutboxError(Error):
pass
class ActivityNotFoundError(Error):
status_code = 404
class BadActivityError(Error):
pass
class RecursionLimitExceededError(BadActivityError):
pass
class UnexpectedActivityTypeError(BadActivityError):
pass
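
The removed Error base class doubles as an API error: status_code drives the HTTP response and to_dict() merges the optional payload with the message. A hypothetical Flask handler shows the intended shape (a sketch, not the repo's actual wiring):

from flask import Flask, jsonify

app = Flask(__name__)

@app.errorhandler(Error)
def handle_error(err):
    # Body is {"message": ...} plus any payload keys;
    # status is 400 by default, 404 for ActivityNotFoundError.
    resp = jsonify(err.to_dict())
    resp.status_code = err.status_code
    return resp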

View File

@@ -1,95 +0,0 @@
"""Implements HTTP signature for Flask requests.
Mastodon instances won't accept requests that are not signed using this scheme.
"""
import base64
import hashlib
import logging
from datetime import datetime
from typing import Any
from typing import Dict
from typing import Optional
from urllib.parse import urlparse
from Crypto.Hash import SHA256
from Crypto.Signature import PKCS1_v1_5
from flask import request
from requests.auth import AuthBase
logger = logging.getLogger(__name__)
def _build_signed_string(signed_headers: str, method: str, path: str, headers: Any, body_digest: str) -> str:
out = []
for signed_header in signed_headers.split(' '):
if signed_header == '(request-target)':
out.append('(request-target): '+method.lower()+' '+path)
elif signed_header == 'digest':
out.append('digest: '+body_digest)
else:
out.append(signed_header+': '+headers[signed_header])
return '\n'.join(out)
def _parse_sig_header(val: Optional[str]) -> Optional[Dict[str, str]]:
if not val:
return None
out = {}
for data in val.split(','):
k, v = data.split('=', 1)
out[k] = v[1:len(v)-1]
return out
def _verify_h(signed_string, signature, pubkey):
signer = PKCS1_v1_5.new(pubkey)
digest = SHA256.new()
digest.update(signed_string.encode('utf-8'))
return signer.verify(digest, signature)
def _body_digest() -> str:
h = hashlib.new('sha256')
h.update(request.data)
return 'SHA-256='+base64.b64encode(h.digest()).decode('utf-8')
def verify_request(actor_service) -> bool:
hsig = _parse_sig_header(request.headers.get('Signature'))
if not hsig:
logger.debug('no signature in header')
return False
logger.debug(f'hsig={hsig}')
signed_string = _build_signed_string(hsig['headers'], request.method, request.path, request.headers, _body_digest())
_, rk = actor_service.get_public_key(hsig['keyId'])
return _verify_h(signed_string, base64.b64decode(hsig['signature']), rk)
class HTTPSigAuth(AuthBase):
def __init__(self, keyid, privkey):
self.keyid = keyid
self.privkey = privkey
def __call__(self, r):
logger.info(f'keyid={self.keyid}')
host = urlparse(r.url).netloc
bh = hashlib.new('sha256')
bh.update(r.body.encode('utf-8'))
bodydigest = 'SHA-256='+base64.b64encode(bh.digest()).decode('utf-8')
date = datetime.utcnow().strftime('%a, %d %b %Y %H:%M:%S GMT')
r.headers.update({'Digest': bodydigest, 'Date': date})
r.headers.update({'Host': host})
sigheaders = '(request-target) user-agent host date digest content-type'
to_be_signed = _build_signed_string(sigheaders, r.method, r.path_url, r.headers, bodydigest)
signer = PKCS1_v1_5.new(self.privkey)
digest = SHA256.new()
digest.update(to_be_signed.encode('utf-8'))
sig = base64.b64encode(signer.sign(digest))
sig = sig.decode('utf-8')
headers = {
'Signature': f'keyId="{self.keyid}",algorithm="rsa-sha256",headers="{sigheaders}",signature="{sig}"'
}
logger.info(f'signed request headers={headers}')
r.headers.update(headers)
return r
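
Because HTTPSigAuth is a requests auth plugin, signing stays transparent at call sites. Note that the signed header list includes user-agent and content-type, so both must be present on the outgoing request. A sketch with a throwaway key (URLs are illustrative):

import requests
from Crypto.PublicKey import RSA

key = RSA.generate(2048)  # throwaway key, for illustration only
auth = HTTPSigAuth("https://example.com/users/alice#main-key", key)

resp = requests.post(
    "https://remote.tld/inbox",
    data='{"type": "Follow"}',  # must be a str: __call__ encodes r.body itself
    headers={
        "Content-Type": "application/activity+json",
        "User-Agent": "microblog.pub",
    },
    auth=auth,
)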

View File

@@ -1,69 +0,0 @@
import base64
import hashlib
from datetime import datetime
from typing import Any
from typing import Dict
from Crypto.Hash import SHA256
from Crypto.Signature import PKCS1_v1_5
from pyld import jsonld
# cache the downloaded "schemas", otherwise the library is super slow
# (https://github.com/digitalbazaar/pyld/issues/70)
_CACHE: Dict[str, Any] = {}
LOADER = jsonld.requests_document_loader()
def _caching_document_loader(url: str) -> Any:
if url in _CACHE:
return _CACHE[url]
resp = LOADER(url)
_CACHE[url] = resp
return resp
jsonld.set_document_loader(_caching_document_loader)
def options_hash(doc):
doc = dict(doc['signature'])
for k in ['type', 'id', 'signatureValue']:
if k in doc:
del doc[k]
doc['@context'] = 'https://w3id.org/identity/v1'
normalized = jsonld.normalize(doc, {'algorithm': 'URDNA2015', 'format': 'application/nquads'})
h = hashlib.new('sha256')
h.update(normalized.encode('utf-8'))
return h.hexdigest()
def doc_hash(doc):
doc = dict(doc)
if 'signature' in doc:
del doc['signature']
normalized = jsonld.normalize(doc, {'algorithm': 'URDNA2015', 'format': 'application/nquads'})
h = hashlib.new('sha256')
h.update(normalized.encode('utf-8'))
return h.hexdigest()
def verify_signature(doc, pubkey):
to_be_signed = options_hash(doc) + doc_hash(doc)
signature = doc['signature']['signatureValue']
signer = PKCS1_v1_5.new(pubkey)
digest = SHA256.new()
digest.update(to_be_signed.encode('utf-8'))
return signer.verify(digest, base64.b64decode(signature))
def generate_signature(doc, privkey):
options = {
'type': 'RsaSignature2017',
'creator': doc['actor'] + '#main-key',
'created': datetime.utcnow().replace(microsecond=0).isoformat() + 'Z',
}
doc['signature'] = options
to_be_signed = options_hash(doc) + doc_hash(doc)
signer = PKCS1_v1_5.new(privkey)
digest = SHA256.new()
digest.update(to_be_signed.encode('utf-8'))
sig = base64.b64encode(signer.sign(digest))
options['signatureValue'] = sig.decode('utf-8')
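
generate_signature mutates the document in place, attaching an RsaSignature2017 block, while verify_signature recomputes the two hashes over the normalized signature options and the signature-stripped body. A round-trip sketch (throwaway key; jsonld.normalize fetches the JSON-LD contexts over the network on first use):

from Crypto.PublicKey import RSA

key = RSA.generate(2048)
doc = {
    "@context": "https://www.w3.org/ns/activitystreams",
    "type": "Create",
    "actor": "https://example.com/users/alice",
    "object": {"type": "Note", "content": "<p>hello</p>"},
}
generate_signature(doc, key)  # adds doc["signature"] in place, returns None
assert verify_signature(doc, key.publickey())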

View File

@@ -16,53 +16,100 @@ class ObjectService(object):
self._known_instances = set()
def _fetch_remote(self, object_id):
print(f'fetch remote {object_id}')
print(f"fetch remote {object_id}")
check_url(object_id)
resp = requests.get(object_id, headers={
'Accept': 'application/activity+json',
'User-Agent': self._user_agent,
})
resp = requests.get(
object_id,
headers={
"Accept": "application/activity+json",
"User-Agent": self._user_agent,
},
)
if resp.status_code == 404:
raise ActivityNotFoundError(f'{object_id} cannot be fetched, 404 not found error')
raise ActivityNotFoundError(
f"{object_id} cannot be fetched, 404 error not found"
)
resp.raise_for_status()
return resp.json()
def _fetch(self, object_id):
instance = urlparse(object_id)._replace(path='', query='', fragment='').geturl()
instance = urlparse(object_id)._replace(path="", query="", fragment="").geturl()
if instance not in self._known_instances:
self._known_instances.add(instance)
if not self._instances.find_one({'instance': instance}):
self._instances.insert({'instance': instance, 'first_object': object_id})
if not self._instances.find_one({"instance": instance}):
self._instances.insert(
{"instance": instance, "first_object": object_id}
)
obj = self._inbox.find_one({'$or': [{'remote_id': object_id}, {'type': 'Create', 'activity.object.id': object_id}]})
obj = self._inbox.find_one(
{
"$or": [
{"remote_id": object_id},
{"type": "Create", "activity.object.id": object_id},
]
}
)
if obj:
if obj['remote_id'] == object_id:
return obj['activity']
return obj['activity']['object']
if obj["remote_id"] == object_id:
return obj["activity"]
return obj["activity"]["object"]
obj = self._outbox.find_one({'$or': [{'remote_id': object_id}, {'type': 'Create', 'activity.object.id': object_id}]})
obj = self._outbox.find_one(
{
"$or": [
{"remote_id": object_id},
{"type": "Create", "activity.object.id": object_id},
]
}
)
if obj:
if obj['remote_id'] == object_id:
return obj['activity']
return obj['activity']['object']
if obj["remote_id"] == object_id:
return obj["activity"]
return obj["activity"]["object"]
return self._fetch_remote(object_id)
def get(self, object_id, reload_cache=False, part_of_stream=False, announce_published=None):
def get(
self,
object_id,
reload_cache=False,
part_of_stream=False,
announce_published=None,
):
if reload_cache:
obj = self._fetch(object_id)
self._col.update({'object_id': object_id}, {'$set': {'cached_object': obj, 'meta.part_of_stream': part_of_stream, 'meta.announce_published': announce_published}}, upsert=True)
self._col.update(
{"object_id": object_id},
{
"$set": {
"cached_object": obj,
"meta.part_of_stream": part_of_stream,
"meta.announce_published": announce_published,
}
},
upsert=True,
)
return obj
cached_object = self._col.find_one({'object_id': object_id})
cached_object = self._col.find_one({"object_id": object_id})
if cached_object:
print(f'ObjectService: {cached_object}')
return cached_object['cached_object']
print(f"ObjectService: {cached_object}")
return cached_object["cached_object"]
obj = self._fetch(object_id)
self._col.update({'object_id': object_id}, {'$set': {'cached_object': obj, 'meta.part_of_stream': part_of_stream, 'meta.announce_published': announce_published}}, upsert=True)
self._col.update(
{"object_id": object_id},
{
"$set": {
"cached_object": obj,
"meta.part_of_stream": part_of_stream,
"meta.announce_published": announce_published,
}
},
upsert=True,
)
# print(f'ObjectService: {obj}')
return obj
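
ObjectService.get resolves an activity ID by checking the local inbox and outbox first (matching either the remote_id or the object ID of a wrapped Create) before fetching remotely, then caches the result alongside stream metadata. A usage sketch, assuming an already-constructed service (the constructor sits above this hunk):

note = object_service.get(
    "https://remote.tld/notes/123",  # illustrative activity ID
    reload_cache=False,              # True bypasses the Mongo cache
    part_of_stream=True,
    announce_published="2018-06-17T19:21:59Z",
)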

View File

@@ -1,37 +1,34 @@
import ipaddress
from urllib.parse import urlparse
import opengraph
import requests
from bs4 import BeautifulSoup
from .urlutils import check_url
from .urlutils import is_url_valid
from little_boxes.urlutils import check_url
from little_boxes.urlutils import is_url_valid
def links_from_note(note):
tags_href= set()
for t in note.get('tag', []):
h = t.get('href')
tags_href = set()
for t in note.get("tag", []):
h = t.get("href")
if h:
# TODO(tsileo): fetch the URL for Actor profile, type=mention
tags_href.add(h)
links = set()
soup = BeautifulSoup(note['content'])
for link in soup.find_all('a'):
h = link.get('href')
if h.startswith('http') and h not in tags_href and is_url_valid(h):
soup = BeautifulSoup(note["content"])
for link in soup.find_all("a"):
h = link.get("href")
if h.startswith("http") and h not in tags_href and is_url_valid(h):
links.add(h)
return links
def fetch_og_metadata(user_agent, col, remote_id):
doc = col.find_one({'remote_id': remote_id})
doc = col.find_one({"remote_id": remote_id})
if not doc:
raise ValueError
note = doc['activity']['object']
note = doc["activity"]["object"]
print(note)
links = links_from_note(note)
if not links:
@@ -40,9 +37,11 @@ def fetch_og_metadata(user_agent, col, remote_id):
htmls = []
for l in links:
check_url(l)
r = requests.get(l, headers={'User-Agent': user_agent})
r = requests.get(l, headers={"User-Agent": user_agent})
r.raise_for_status()
htmls.append(r.text)
links_og_metadata = [dict(opengraph.OpenGraph(html=html)) for html in htmls]
col.update_one({'remote_id': remote_id}, {'$set': {'meta.og_metadata': links_og_metadata}})
col.update_one(
{"remote_id": remote_id}, {"$set": {"meta.og_metadata": links_og_metadata}}
)
return len(links)
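
links_from_note collects outbound <a> targets from the note's HTML while skipping anything already listed in the note's tag entries (mentions, hashtags); fetch_og_metadata then fetches each surviving link and stores the Open Graph data under meta.og_metadata. A sketch of the extraction step (illustrative payload; is_url_valid also requires the hostname to resolve to a public address):

note = {
    "content": (
        '<p><a href="https://example.com/article">an article</a> '
        '<a href="https://masto.tld/@bob">@bob</a></p>'
    ),
    "tag": [{"type": "Mention", "href": "https://masto.tld/@bob"}],
}
links_from_note(note)  # {"https://example.com/article"} -- the mention is skipped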

View File

@@ -1,47 +0,0 @@
import ipaddress
import logging
import os
import socket
from urllib.parse import urlparse
from . import strtobool
from .errors import Error
logger = logging.getLogger(__name__)
class InvalidURLError(Error):
pass
def is_url_valid(url: str) -> bool:
parsed = urlparse(url)
if parsed.scheme not in ['http', 'https']:
return False
# XXX in debug mode, we want to allow requests to localhost to test the federation with local instances
debug_mode = strtobool(os.getenv('MICROBLOGPUB_DEBUG', 'false'))
if debug_mode:
return True
if parsed.hostname in ['localhost']:
return False
try:
ip_address = socket.getaddrinfo(parsed.hostname, parsed.port or 80)[0][4][0]
except socket.gaierror:
logger.exception(f'failed to lookup url {url}')
return False
if ipaddress.ip_address(ip_address).is_private:
logger.info(f'rejecting private URL {url}')
return False
return True
def check_url(url: str) -> None:
if not is_url_valid(url):
raise InvalidURLError(f'"{url}" is invalid')
return None
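
This removed module was the SSRF guard for every outgoing fetch: only http(s) URLs whose hostnames resolve to public addresses pass, with the MICROBLOGPUB_DEBUG escape hatch allowing localhost while testing federation between local instances. Expected behavior, for illustration:

import os

os.environ["MICROBLOGPUB_DEBUG"] = "false"

is_url_valid("ftp://example.com/f")     # False: only http/https schemes allowed
is_url_valid("http://localhost:5005/")  # False: localhost is always rejected
is_url_valid("http://10.0.0.1/x")       # False: resolves to a private address
# check_url("http://localhost:5005/") raises InvalidURLError instead of returning False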

View File

@@ -1,75 +0,0 @@
import logging
from typing import Any
from typing import Dict
from typing import Optional
from urllib.parse import urlparse
import requests
from .urlutils import check_url
logger = logging.getLogger(__name__)
def webfinger(resource: str) -> Optional[Dict[str, Any]]:
"""Mastodon-like WebFinger resolution to retrieve the activity stream Actor URL.
"""
logger.info(f'performing webfinger resolution for {resource}')
protos = ['https', 'http']
if resource.startswith('http://'):
protos.reverse()
host = urlparse(resource).netloc
elif resource.startswith('https://'):
host = urlparse(resource).netloc
else:
if resource.startswith('acct:'):
resource = resource[5:]
if resource.startswith('@'):
resource = resource[1:]
_, host = resource.split('@', 1)
resource='acct:'+resource
# Security check on the url (like not calling localhost)
check_url(f'https://{host}')
for i, proto in enumerate(protos):
try:
url = f'{proto}://{host}/.well-known/webfinger'
resp = requests.get(
url,
{'resource': resource}
)
except requests.ConnectionError:
# If we tried https first and the domain is "http only"
if i == 0:
continue
break
if resp.status_code == 404:
return None
resp.raise_for_status()
return resp.json()
def get_remote_follow_template(resource: str) -> Optional[str]:
data = webfinger(resource)
if data is None:
return None
for link in data['links']:
if link.get('rel') == 'http://ostatus.org/schema/1.0/subscribe':
return link.get('template')
return None
def get_actor_url(resource: str) -> Optional[str]:
"""Mastodon-like WebFinger resolution to retrieve the activity stream Actor URL.
Returns:
the Actor URL or None if the resolution failed.
"""
data = webfinger(resource)
if data is None:
return None
for link in data['links']:
if link.get('rel') == 'self' and link.get('type') == 'application/activity+json':
return link.get('href')
return None
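
End to end, get_actor_url is what mentionify relies on earlier in this commit: WebFinger the acct: resource, then pick the rel="self" link typed application/activity+json. An illustrative resolution (the account and resulting URLs are hypothetical):

actor_url = get_actor_url("@alice@example.com")
# GET https://example.com/.well-known/webfinger?resource=acct:alice@example.com
# -> "https://example.com/users/alice" (the rel="self" link)

follow_template = get_remote_follow_template("alice@example.com")
# -> e.g. "https://example.com/authorize_interaction?uri={uri}"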