duplicate content enforcement
Some checks failed
Build and Push Docker Image / build (push): Failing after 7m4s
This commit is contained in:
parent 1f9075ce60
commit 81795feb65
@@ -109,7 +109,7 @@ class TaskCog(commands.Cog):
         await do_batch_job(servers, self.process_server, 10)
 
         end_time = perf_counter()
-        log.debug(f"completed task in {end_time - start_time:.4f} seconds")
+        log.info(f"completed task in {end_time - start_time:.4f} seconds")
 
     async def iterate_pages(self, url: str, params: dict={}):
@@ -147,6 +147,28 @@ class TaskCog(commands.Cog):
 
         return models.Subscription.from_list(subscriptions)
 
+    async def get_contents(self, subscription: models.Subscription, raw_rss_content: dict):
+        contents = await models.Content.from_raw_rss(raw_rss_content, subscription, self.client)
+        duplicate_contents = []
+
+        async def check_duplicate_content(content: models.Content):
+            exists = await content.exists_via_api(
+                url=self.api_base_url + "content/",
+                headers=self.api_headers,
+                client=self.client
+            )
+
+            if exists:
+                log.debug(f"Removing duplicate {content}")
+                duplicate_contents.append(content)
+
+        await do_batch_job(contents, check_duplicate_content, 15)
+
+        for duplicate in duplicate_contents:
+            contents.remove(duplicate)
+
+        return contents
+
     async def process_server(self, server: models.Server):
         log.debug(f"processing server: {server.name}")
         start_time = perf_counter()
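Note: `do_batch_job` itself is not part of this diff, so its exact behavior is an assumption; judging by the call sites `do_batch_job(servers, self.process_server, 10)` and `do_batch_job(contents, check_duplicate_content, 15)`, it plausibly runs an async callback over a list in bounded concurrent batches. A minimal sketch of such a helper, under that assumption:

# Hypothetical sketch: the real do_batch_job lives elsewhere in the repo;
# the signature and batching strategy here are assumed, not confirmed by this diff.
import asyncio

async def do_batch_job(items, job, batch_size):
    # Run `job` over `items`, at most `batch_size` tasks in flight per slice.
    for i in range(0, len(items), batch_size):
        await asyncio.gather(*(job(item) for item in items[i:i + batch_size]))

Collecting hits into `duplicate_contents` and removing them after the batch job finishes also avoids mutating `contents` while it is being iterated, which the inline version removed in the next hunk did.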
@@ -168,42 +190,17 @@ class TaskCog(commands.Cog):
         if not raw_rss_content:
             return
 
-        contents = await models.Content.from_raw_rss(raw_rss_content, subscription, self.client)
-
-        async def check_duplicate_content(content: models.Content):
-            params = {
-                "match_any": True, # allows any param to match, instead of needing all
-                "item_id": content.item_id,
-                "item_guid": content.item_guid,
-                "item_url": content.item_url,
-                "item_title": content.item_title,
-                "item_content_hash": content.item_content_hash,
-                "subscription": content.subscription_id
-            }
-
-            try:
-                response = await self.client.get(
-                    self.api_base_url + f"content/",
-                    headers=self.api_headers,
-                    params=params
-                )
-                response.raise_for_status()
-
-                if len(response.json().get("results", [])):
-                    log.debug("found duplicate")
-                    contents.remove(content)
-            except httpx.HTTPError as exc:
-                log.error(f"assuming not duplicate {exc}")
-
-        # clear duplicate content
-        log.debug(f"checking for duplicates (count: {len(contents)})")
-        await do_batch_job(contents, check_duplicate_content, 15)
-        log.debug(f"finished looking for duplicates (count: {len(contents)})")
+        contents = await self.get_contents(subscription, raw_rss_content)
         if not contents:
             log.debug("no contents to process")
             return
 
         channels = await subscription.get_discord_channels(self.bot)
         valid_contents, invalid_contents = subscription.filter_entries(contents)
 
         async def send_content(channel: discord.TextChannel):
+            # BUG: I believe there are duplicate embeds here
+            # discord only shows 1 when urls are matching, but merges images from both into the 1
             embeds = [content.embed for content in valid_contents]
             batch_size = 10
             for i in range(0, len(embeds), batch_size):
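On the BUG note above: as far as I know, Discord collapses embeds that share the same `url` into a single preview and merges their images into it, and a single message can carry at most 10 embeds, which is what `batch_size = 10` accounts for. One possible mitigation, not part of this commit, would be to drop same-URL embeds before batching:

# Hypothetical pre-filter (not in this commit): keep only the first embed per URL.
def dedupe_embeds(embeds: list) -> list:
    seen_urls = set()
    unique = []
    for embed in embeds:
        if embed.url and embed.url in seen_urls:
            continue  # Discord would merge this one into the earlier preview
        if embed.url:
            seen_urls.add(embed.url)
        unique.append(embed)
    return unique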
@@ -217,18 +217,6 @@ class MessageStyle(DjangoDataModel):
         return item
 
 
-@dataclass(slots=True)
-class UniqueContentRule(DjangoDataModel):
-    id: int
-    name: str
-    value: str
-
-    @staticmethod
-    def parser(item: dict) -> dict:
-        item["id"] = int(item.pop("id"))
-        return item
-
-
 @dataclass(slots=True)
 class DiscordChannel(DjangoDataModel):
     id: int
@@ -255,7 +243,6 @@ class Subscription(DjangoDataModel):
     channels: list[DiscordChannel]
     filters: list[ContentFilter]
     message_style: MessageStyle
-    unique_rules: UniqueContentRule
     _server: Server | None = None
 
     @staticmethod
@@ -268,7 +255,6 @@ class Subscription(DjangoDataModel):
         item["channels"] = DiscordChannel.from_list(item.pop("channels_detail"))
         item["filters"] = ContentFilter.from_list(item.pop("filters_detail"))
         item["message_style"] = MessageStyle.from_dict(item.pop("message_style_detail"))
-        item["unique_rules"] = UniqueContentRule.from_list(item.pop("unique_rules_detail"))
         return item
 
     @property
@@ -358,6 +344,53 @@ class Content(DjangoDataModel):
         item["subscription_id"] = item.pop("subscription")
         return item
 
+    async def exists_via_api(self, url: str, headers: dict, client: httpx.AsyncClient):
+        log.debug(f"checking if {self.item_content_hash} exists via API")
+        params = {
+            "match_any": True, # allows any param to match, instead of needing all
+            "item_id": self.item_id,
+            "item_guid": self.item_guid,
+            "item_url": self.item_url,
+            "item_title": self.item_title,
+            "item_content_hash": self.item_content_hash,
+            "subscription": self.subscription_id
+        }
+
+        try:
+            response = await client.get(
+                url=url,
+                headers=headers,
+                params=params
+            )
+            response.raise_for_status()
+        except httpx.HTTPError as exc:
+            log.error(f"assuming not duplicate due to error: {exc}")
+            return False
+
+        return response.json().get("results", [])
+
+    def is_duplicate(self, other):
+        if not isinstance(other, Content):
+            raise ValueError(f"Expected Content, received {type(other)}")
+
+        other_details = other.duplicate_details
+        return any(
+            other_details.get(key) == value
+            for key, value in self.duplicate_details.items()
+        )
+
+    @property
+    def duplicate_details(self):
+        keys = [
+            "item_id",
+            "item_guid",
+            "item_url",
+            "item_title",
+            "item_content_hash"
+        ]
+        data = asdict(self)
+        return { key: data[key] for key in keys }
+
     async def save(self, client: httpx.AsyncClient, base_url: str, headers: dict):
         log.debug(f"saving content {self.item_content_hash}")
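`is_duplicate` treats two items as duplicates as soon as any one of the five identity fields matches, and `exists_via_api` mirrors that on the server side with `match_any`. Worth keeping in mind: `any()` over `duplicate_details` also fires when two unrelated items happen to share a single field value. A standalone illustration of the comparison rule, using a stand-in dataclass since the real `Content` has many more fields:

# Stand-in for Content, just to show the any-field-matches rule.
from dataclasses import dataclass, asdict

@dataclass
class Item:
    item_guid: str
    item_title: str

    def is_duplicate(self, other: "Item") -> bool:
        other_details = asdict(other)
        return any(other_details.get(key) == value
                   for key, value in asdict(self).items())

a = Item(item_guid="guid-1", item_title="Breaking news")
b = Item(item_guid="guid-2", item_title="Breaking news")
assert a.is_duplicate(b)  # True: the titles match; the differing guids don't matter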
@@ -373,8 +406,8 @@ class Content(DjangoDataModel):
             headers=headers,
             data=data
         )
-        log.debug(response.text)
         response.raise_for_status()
+        log.debug(f"save success for {self.item_content_hash}")
 
     @classmethod
     async def from_raw_rss(cls, rss: str, subscription: Subscription, client: httpx.AsyncClient):
@@ -383,6 +416,13 @@ class Content(DjangoDataModel):
         contents = []
 
         async def create_content(entry: feedparser.FeedParserDict):
+            published = entry.get("published_parsed")
+            published = datetime(*published[0:6] if published else None, tzinfo=timezone.utc)
+
+            if published < subscription.publish_threshold:
+                log.debug("skipping due to publish threshold")
+                return
+
             content_hash = hashlib.new("sha256")
             content_hash.update(entry.get("description", "").encode())
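One caveat with the moved-up publish handling: `datetime(*published[0:6] if published else None, tzinfo=timezone.utc)` unpacks the whole conditional, i.e. `datetime(*(published[0:6] if published else None), ...)`, so an entry with no `published_parsed` raises a `TypeError` when unpacking `None` rather than falling back to anything. A None-safe variant (an editorial sketch, not what this commit does) could look like:

# Hypothetical None-safe parse; the fallback value is an assumption.
from datetime import datetime, timezone

def parse_published(published_parsed) -> datetime:
    if published_parsed:
        return datetime(*published_parsed[0:6], tzinfo=timezone.utc)
    return datetime.now(timezone.utc)  # assumed fallback for entries without a date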
@@ -391,9 +431,6 @@ class Content(DjangoDataModel):
             if style.fetch_images:
                 item_image_url = await cls.get_image_url(item_url, client)
 
-            published = entry.get("published_parsed")
-            published = datetime(*published[0:6] if published else None, tzinfo=timezone.utc)
-
             content = Content.from_dict({
                 "id": -1,
                 "subscription": subscription.id,
@@ -412,6 +449,12 @@ class Content(DjangoDataModel):
                 "item_feed_url": parsed_rss.get("feed", {}).get("link")
             })
 
+            # Weed out duplicates
+            log.debug("weeding out duplicates")
+            if any(content.is_duplicate(other) for other in contents):
+                log.debug("found duplicate while loading rss data")
+                return
+
             content.subscription = subscription
             contents.append(content)
@@ -423,7 +466,11 @@ class Content(DjangoDataModel):
     async def get_image_url(url: str, client: httpx.AsyncClient) -> str | None:
         log.debug("Fetching image url")
 
-        response = await client.get(url, timeout=15)
+        try:
+            response = await client.get(url, timeout=15)
+        except httpx.HTTPError:
+            return None
 
         soup = BeautifulSoup(response.text, "html.parser")
         image_element = soup.select_one("meta[property='og:image']")
         if not image_element:
@@ -470,4 +517,6 @@ class Content(DjangoDataModel):
         )
         embed.set_footer(text=self.subscription.name)
 
+        log.debug(f"created embed: {embed.to_dict()}")
+
         return embed
src/tests.py (111 changed lines)
@@ -1,8 +1,9 @@
+import re
 
 def test_content_filters():
     """
     In this test, a content filter is created and used to filter some data.
     """
 
     from models import ContentFilter, MatchingAlgorithm
 
     content_filter = ContentFilter(
@@ -26,8 +27,110 @@ def test_content_filters():
     print("success")
 
 
+def test_content_duplicates():
+    """
+    In this test, two almost but not quite identical instances of `Content` are created and
+    checked against each other as duplicates.
+
+    They should be considered duplicates, because not all fields need to match for two
+    items to count as duplicates.
+
+    The data comes from a real-world example where an item that should have been treated
+    as a duplicate was missed.
+    """
+
+    from models import Content
+    from datetime import datetime
+
+    datetime_now = datetime.now()
+
+    first_content = Content(
+        id=0,
+        subscription_id=38,
+        item_id="https://www.bbc.com/news/articles/ced9l7799w9o#0",
+        item_guid="https://www.bbc.com/news/articles/ced9l7799w9o#0",
+        item_url="https://www.bbc.com/news/articles/ced9l7799w9o",
+        item_title="Spain's PM orders 10,000 troops and police to Valencia",
+        item_description="",
+        item_content_hash="4a2ba8429a0584ce08f521db3f6d3000c248467f2cc6fa4b0458808169247ad8",
+        item_image_url="",
+        item_thumbnail_url="",
+        item_published=datetime_now,
+        item_author="",
+        item_author_url="",
+        item_feed_title="",
+        item_feed_url="",
+        blocked=False,
+    )
+
+    second_content = Content(
+        id=1,
+        subscription_id=38,
+        item_id="https://www.bbc.com/news/articles/ced9l7799w9o#0",
+        item_guid="https://www.bbc.com/news/articles/ced9l7799w9o#0",
+        item_url="https://www.bbc.com/news/articles/ced9l7799w9o",
+        item_title="Spain's PM orders 10,000 troops and police to flood-hit Valencia",
+        item_description="",
+        item_content_hash="4a2ba8429a0584ce08f521db3f6d3000c248467f2cc6fa4b0458808169247ad8",
+        item_image_url="",
+        item_thumbnail_url="",
+        item_published=datetime_now,
+        item_author="",
+        item_author_url="",
+        item_feed_title="",
+        item_feed_url="",
+        blocked=False,
+    )
+
+    assert first_content.is_duplicate(second_content), "Content is not considered a duplicate"
+
+    print("1 success")
+
+    # BUG: This one is identified but still gets processed...
+
+    third_content = Content(
+        id=0,
+        subscription_id=38,
+        item_id="https://www.bbc.com/sport/formula1/articles/cdd0ey1v5j9o#8",
+        item_guid="https://www.bbc.com/sport/formula1/articles/cdd0ey1v5j9o#8",
+        item_url="https://www.bbc.com/sport/formula1/articles/cdd0ey1v5j9o",
+        item_title="Sao Paulo GP qualifying set for Sunday after rain postponement",
+        item_description="",
+        item_content_hash="b6c78de554a183cfeca88decf987401719d431647523f038a86fd7d972e4e799",
+        item_image_url="",
+        item_thumbnail_url="",
+        item_published=datetime_now,
+        item_author="",
+        item_author_url="",
+        item_feed_title="",
+        item_feed_url="",
+        blocked=False,
+    )
+
+    fourth_content = Content(
+        id=0,
+        subscription_id=38,
+        item_id="https://www.bbc.com/sport/formula1/articles/cdd0ey1v5j9o#8",
+        item_guid="https://www.bbc.com/sport/formula1/articles/cdd0ey1v5j9o#8",
+        item_url="https://www.bbc.com/sport/formula1/articles/cdd0ey1v5j9o",
+        item_title="Sao Paulo GP qualifying set for Sunday after rain postponement",
+        item_description="",
+        item_content_hash="6ddd15d7d9626f2d63ba5631056fda9bcaf920e8c82ec5c23fa824b02ce690d0",
+        item_image_url="",
+        item_thumbnail_url="",
+        item_published=datetime_now,
+        item_author="",
+        item_author_url="",
+        item_feed_title="",
+        item_feed_url="",
+        blocked=False,
+    )
+
+    assert third_content.is_duplicate(fourth_content)
+
+    print("2 success")
+
 def main():
-    test_content_filters()
+    # test_content_filters()
+    test_content_duplicates()
 
 if __name__ == "__main__":
     main()
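Both assertions should pass under the any-field rule: `first_content`/`second_content` share `item_id`, `item_guid`, `item_url` and `item_content_hash` and differ only in title, while `third_content`/`fourth_content` differ only in `item_content_hash` and share the other four identity fields. Given that, the `# BUG` comment presumably points at the pipeline rather than `is_duplicate` itself, e.g. the two items arriving in separate runs where only the API-side check applies; that reading is mine, not something the diff states. A small helper (not in the commit) to see which fields drive a given match:

# Editorial helper: list the identity fields two Content items share.
def matching_fields(a, b) -> list[str]:
    b_details = b.duplicate_details
    return [key for key, value in a.duplicate_details.items()
            if b_details.get(key) == value]

# matching_fields(third_content, fourth_content)
# -> ["item_id", "item_guid", "item_url", "item_title"]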