Move stuff arround
This commit is contained in:
182
core/gc.py
Normal file
182
core/gc.py
Normal file
@@ -0,0 +1,182 @@
|
||||
import logging
|
||||
from datetime import datetime
|
||||
from datetime import timedelta
|
||||
from time import perf_counter
|
||||
from typing import Any
|
||||
from typing import Dict
|
||||
from typing import List
|
||||
|
||||
from little_boxes import activitypub as ap
|
||||
|
||||
import activitypub
|
||||
from activitypub import Box
|
||||
from config import DAYS_TO_KEEP
|
||||
from config import ID
|
||||
from config import ME
|
||||
from config import MEDIA_CACHE
|
||||
from utils.migrations import DB
|
||||
|
||||
back = activitypub.MicroblogPubBackend()
|
||||
ap.use_backend(back)
|
||||
|
||||
MY_PERSON = ap.Person(**ME)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def threads_of_interest() -> List[str]:
|
||||
out = set()
|
||||
|
||||
# Fetch all the threads we've participed in
|
||||
for data in DB.activities.find(
|
||||
{
|
||||
"meta.thread_root_parent": {"$exists": True},
|
||||
"box": Box.OUTBOX.value,
|
||||
"type": ap.ActivityType.CREATE.value,
|
||||
}
|
||||
):
|
||||
out.add(data["meta"]["thread_root_parent"])
|
||||
|
||||
# Fetch all threads related to bookmarked activities
|
||||
for data in DB.activities.find({"meta.bookmarked": True}):
|
||||
# Keep the replies
|
||||
out.add(data["meta"]["object_id"])
|
||||
# And the whole thread if any
|
||||
if "thread_root_parent" in data["meta"]:
|
||||
out.add(data["meta"]["thread_root_parent"])
|
||||
|
||||
return list(out)
|
||||
|
||||
|
||||
def _keep(data: Dict[str, Any]) -> None:
|
||||
DB.activities.update_one({"_id": data["_id"]}, {"$set": {"meta.gc_keep": True}})
|
||||
|
||||
|
||||
def perform() -> None: # noqa: C901
|
||||
start = perf_counter()
|
||||
d = (datetime.utcnow() - timedelta(days=DAYS_TO_KEEP)).strftime("%Y-%m-%d")
|
||||
toi = threads_of_interest()
|
||||
logger.info(f"thread_of_interest={toi!r}")
|
||||
|
||||
create_deleted = 0
|
||||
create_count = 0
|
||||
# Go over the old Create activities
|
||||
for data in DB.activities.find(
|
||||
{
|
||||
"box": Box.INBOX.value,
|
||||
"type": ap.ActivityType.CREATE.value,
|
||||
"activity.published": {"$lt": d},
|
||||
"meta.gc_keep": {"$exists": False},
|
||||
}
|
||||
).limit(500):
|
||||
try:
|
||||
create_count += 1
|
||||
remote_id = data["remote_id"]
|
||||
meta = data["meta"]
|
||||
activity = ap.parse_activity(data["activity"])
|
||||
logger.info(f"activity={activity!r}")
|
||||
|
||||
# This activity has been bookmarked, keep it
|
||||
if meta.get("bookmarked"):
|
||||
_keep(data)
|
||||
continue
|
||||
|
||||
# Inspect the object
|
||||
obj = activity.get_object()
|
||||
|
||||
# This activity mentions the server actor, keep it
|
||||
if obj.has_mention(ID):
|
||||
_keep(data)
|
||||
continue
|
||||
|
||||
# This activity is a direct reply of one the server actor activity, keep it
|
||||
in_reply_to = obj.get_in_reply_to()
|
||||
if in_reply_to and in_reply_to.startswith(ID):
|
||||
_keep(data)
|
||||
continue
|
||||
|
||||
# This activity is part of a thread we want to keep, keep it
|
||||
if in_reply_to and meta.get("thread_root_parent"):
|
||||
thread_root_parent = meta["thread_root_parent"]
|
||||
if thread_root_parent.startswith(ID) or thread_root_parent in toi:
|
||||
_keep(data)
|
||||
continue
|
||||
|
||||
# This activity was boosted or liked, keep it
|
||||
if meta.get("boosted") or meta.get("liked"):
|
||||
_keep(data)
|
||||
continue
|
||||
|
||||
# TODO(tsileo): remove after tests
|
||||
if meta.get("keep"):
|
||||
logger.warning(
|
||||
f"{activity!r} would not have been deleted, skipping for now"
|
||||
)
|
||||
_keep(data)
|
||||
continue
|
||||
|
||||
# Delete the cached attachment
|
||||
for grid_item in MEDIA_CACHE.fs.find({"remote_id": remote_id}):
|
||||
MEDIA_CACHE.fs.delete(grid_item._id)
|
||||
|
||||
# Delete the activity
|
||||
DB.activities.delete_one({"_id": data["_id"]})
|
||||
create_deleted += 1
|
||||
except Exception:
|
||||
logger.exception(f"failed to process {data!r}")
|
||||
|
||||
after_gc_create = perf_counter()
|
||||
time_to_gc_create = after_gc_create - start
|
||||
logger.info(
|
||||
f"{time_to_gc_create:.2f} seconds to analyze {create_count} Create, {create_deleted} deleted"
|
||||
)
|
||||
|
||||
announce_count = 0
|
||||
announce_deleted = 0
|
||||
# Go over the old Create activities
|
||||
for data in DB.activities.find(
|
||||
{
|
||||
"box": Box.INBOX.value,
|
||||
"type": ap.ActivityType.ANNOUNCE.value,
|
||||
"activity.published": {"$lt": d},
|
||||
"meta.gc_keep": {"$exists": False},
|
||||
}
|
||||
).limit(500):
|
||||
try:
|
||||
announce_count += 1
|
||||
remote_id = data["remote_id"]
|
||||
meta = data["meta"]
|
||||
activity = ap.parse_activity(data["activity"])
|
||||
logger.info(f"activity={activity!r}")
|
||||
|
||||
# This activity has been bookmarked, keep it
|
||||
if meta.get("bookmarked"):
|
||||
_keep(data)
|
||||
continue
|
||||
|
||||
object_id = activity.get_object_id()
|
||||
|
||||
# This announce is for a local activity (i.e. from the outbox), keep it
|
||||
if object_id.startswith(ID):
|
||||
_keep(data)
|
||||
continue
|
||||
|
||||
for grid_item in MEDIA_CACHE.fs.find({"remote_id": remote_id}):
|
||||
MEDIA_CACHE.fs.delete(grid_item._id)
|
||||
|
||||
# TODO(tsileo): here for legacy reason, this needs to be removed at some point
|
||||
for grid_item in MEDIA_CACHE.fs.find({"remote_id": object_id}):
|
||||
MEDIA_CACHE.fs.delete(grid_item._id)
|
||||
|
||||
# Delete the activity
|
||||
DB.activities.delete_one({"_id": data["_id"]})
|
||||
|
||||
announce_deleted += 1
|
||||
except Exception:
|
||||
logger.exception(f"failed to process {data!r}")
|
||||
|
||||
after_gc_announce = perf_counter()
|
||||
time_to_gc_announce = after_gc_announce - after_gc_create
|
||||
logger.info(
|
||||
f"{time_to_gc_announce:.2f} seconds to analyze {announce_count} Announce, {announce_deleted} deleted"
|
||||
)
|
Reference in New Issue
Block a user