improved filters
This commit is contained in:
parent
ab788ed379
commit
e000f05df2
@ -4,6 +4,7 @@ Loading this file via `commands.Bot.load_extension` will add `TaskCog` to the bo
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
import re
|
import re
|
||||||
|
import json
|
||||||
import logging
|
import logging
|
||||||
import datetime
|
import datetime
|
||||||
from os import getenv
|
from os import getenv
|
||||||
@ -18,6 +19,7 @@ from feedparser import parse
|
|||||||
|
|
||||||
from feed import RSSFeed, Subscription, RSSItem
|
from feed import RSSFeed, Subscription, RSSItem
|
||||||
from utils import get_unparsed_feed
|
from utils import get_unparsed_feed
|
||||||
|
from filters import match_text
|
||||||
from api import API
|
from api import API
|
||||||
|
|
||||||
log = logging.getLogger(__name__)
|
log = logging.getLogger(__name__)
|
||||||
@ -122,6 +124,8 @@ class TaskCog(commands.Cog):
|
|||||||
channels = [self.bot.get_channel(channel.channel_id) for channel in await sub.get_channels(api)]
|
channels = [self.bot.get_channel(channel.channel_id) for channel in await sub.get_channels(api)]
|
||||||
filters = [await api.get_filter(filter_id) for filter_id in sub.filters]
|
filters = [await api.get_filter(filter_id) for filter_id in sub.filters]
|
||||||
|
|
||||||
|
log.debug(json.dumps(filters, indent=4))
|
||||||
|
|
||||||
for item in feed.items:
|
for item in feed.items:
|
||||||
log.debug("processing item '%s'", item.guid)
|
log.debug("processing item '%s'", item.guid)
|
||||||
|
|
||||||
@ -129,7 +133,9 @@ class TaskCog(commands.Cog):
|
|||||||
log.debug("item '%s' older than subscription threshold '%s', skipping", item.pub_date, sub.published_threshold)
|
log.debug("item '%s' older than subscription threshold '%s', skipping", item.pub_date, sub.published_threshold)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
log.debug("before")
|
||||||
blocked = any(self.filter_item(_filter, item) for _filter in filters)
|
blocked = any(self.filter_item(_filter, item) for _filter in filters)
|
||||||
|
log.debug("after")
|
||||||
mutated_item = item.create_mutated_copy(sub.mutators)
|
mutated_item = item.create_mutated_copy(sub.mutators)
|
||||||
|
|
||||||
for channel in channels:
|
for channel in channels:
|
||||||
@ -140,29 +146,19 @@ class TaskCog(commands.Cog):
|
|||||||
|
|
||||||
def filter_item(self, _filter: dict, item: RSSItem) -> bool:
    """
    Returns `True` if item should be ignored due to filters.

    The filter is tested against the item's title first and its description
    second via `match_text`; a match on either blocks the item.

    Any exception raised while matching is logged with its traceback and
    treated as "no match", so one malformed filter cannot stall or crash
    the processing of a feed.
    """

    # NOTE(review): the filter's whitelist flag is not applied here, and
    # `match_text` only logs it. The previous implementation inverted the
    # result for whitelist filters - confirm whether that should be restored.
    log.debug("checking filter")

    try:
        match_found = match_text(_filter, item.title) or match_text(_filter, item.description)
    except Exception:
        # Never block on interactive input here: this runs inside the bot's
        # task loop, so waiting on stdin would freeze the whole process.
        log.exception("error while applying filter, treating as no match")
        return False

    log.debug("filter match found? '%s'", match_found)
    return match_found
|
|
||||||
|
|
||||||
async def mark_tracked_item(self, api: API, sub: Subscription, item: RSSItem, channel_id: int, blocked: bool):
|
async def mark_tracked_item(self, api: API, sub: Subscription, item: RSSItem, channel_id: int, blocked: bool):
|
||||||
try:
|
try:
|
||||||
|
102
src/filters.py
Normal file
102
src/filters.py
Normal file
@ -0,0 +1,102 @@
|
|||||||
|
import logging
import re
|
||||||
|
|
||||||
|
def _split_match(_match):
    """
    Splits the match to individual keywords, getting rid of unnecessary
    spaces and grouping quoted words together.

    Every returned term is a regex fragment: special characters are escaped
    with `re.escape` and whitespace inside a quoted group becomes `\\s+`,
    so the group matches regardless of how its words are spaced.
    Terms that are empty after stripping (e.g. the quoted group `" "`) are
    dropped, because an empty pattern would match almost any text.

    Example:
      '  some random  words "with   quotes  " and   spaces'
        ==>
      ["some", "random", "words", "with\\s+quotes", "and", "spaces"]
    """
    terms = []
    # Group 1 captures the inside of a quoted phrase, group 2 a bare word.
    for quoted, bare in re.findall(r'"([^"]+)"|(\S+)', _match):
        term = re.sub(r"\s+", " ", (quoted or bare).strip())
        if not term:
            continue
        # re.escape turns " " into "\ "; swap that for a flexible-whitespace matcher.
        terms.append(re.escape(term).replace(r"\ ", r"\s+"))
    return terms
|
||||||
|
|
||||||
|
def _match_any(_match: str, matching_to: str, **search_kwargs) -> bool:
    """
    Returns `True` if at least one keyword of `_match` occurs in `matching_to`.

    `_match` is split into keywords with `_split_match`. Each keyword must
    appear as a whole term, i.e. not directly adjacent to a word character.
    Lookarounds are used instead of `\\b` so that keywords starting or ending
    with punctuation (e.g. "c++") can still match.

    `search_kwargs` are forwarded to `re.search` (e.g. `flags`).
    Returns `False` when `_match` yields no keywords.
    """
    return any(
        re.search(rf"(?<!\w){word}(?!\w)", matching_to, **search_kwargs)
        for word in _split_match(_match)
    )
|
||||||
|
|
||||||
|
def _match_all(_match: str, matching_to: str, **search_kwargs) -> bool:
    """
    Returns `True` if every keyword of `_match` occurs in `matching_to`.

    `_match` is split into keywords with `_split_match`. Each keyword must
    appear as a whole term, i.e. not directly adjacent to a word character.
    Lookarounds are used instead of `\\b` so that keywords starting or ending
    with punctuation (e.g. "c++") can still match.

    `search_kwargs` are forwarded to `re.search` (e.g. `flags`).
    Returns `True` when `_match` yields no keywords.
    """
    return all(
        re.search(rf"(?<!\w){word}(?!\w)", matching_to, **search_kwargs)
        for word in _split_match(_match)
    )
|
||||||
|
|
||||||
|
def _match_literal(_match: str, matching_to: str, **search_kwargs) -> bool:
    """
    Returns `True` if the exact text `_match` occurs in `matching_to`.

    `_match` is escaped, so it is never interpreted as a regular expression.
    The occurrence must not be directly adjacent to a word character, which
    stops "cat" from matching inside "concatenate". Lookarounds are used
    instead of `\\b` so that text starting or ending with punctuation
    (e.g. "c++") can still match.

    `search_kwargs` are forwarded to `re.search` (e.g. `flags`).
    """
    pattern = rf"(?<!\w){re.escape(_match)}(?!\w)"
    return bool(re.search(pattern, matching_to, **search_kwargs))
|
||||||
|
|
||||||
|
def _match_regex(_match: str, matching_to: str, **search_kwargs) -> bool:
    """
    Returns `True` if the regular expression `_match` is found in `matching_to`.

    `search_kwargs` are forwarded to `re.compile` (e.g. `flags`).
    An invalid pattern is logged and treated as "no match" instead of raising.
    """
    try:
        pattern = re.compile(_match, **search_kwargs)
    except re.error as err:
        logging.getLogger(__name__).error("invalid regex '%s': %s", _match, err)
        return False

    return bool(pattern.search(matching_to))
|
||||||
|
|
||||||
|
def _match_fuzzy(_match: str, matching_to: str, **search_kwargs) -> bool:
    """
    Returns `True` if `_match` approximately occurs in `matching_to`.

    Punctuation is removed from both strings before they are compared with
    rapidfuzz's `partial_ratio`, using a score cutoff of 90.

    Case-insensitivity is taken from `search_kwargs`: when its `flags` entry
    contains `re.IGNORECASE`, both strings are lower-cased first.
    """
    # Imported lazily so the third-party dependency is only needed when the
    # fuzzy algorithm is actually used.
    from rapidfuzz import fuzz

    pattern = re.sub(r"[^\w\s]", "", _match)
    text = re.sub(r"[^\w\s]", "", matching_to)

    if search_kwargs.get("flags", 0) & re.IGNORECASE:
        pattern = pattern.lower()
        text = text.lower()

    # partial_ratio yields 0 for scores below the cutoff, so bool() is the verdict.
    return bool(fuzz.partial_ratio(pattern, text, score_cutoff=90))
|
||||||
|
|
||||||
|
def match_text(_filter: dict, matching_to: str) -> bool:
|
||||||
|
search_kwargs = {}
|
||||||
|
|
||||||
|
algorithm = _filter["matching_algorithm"]
|
||||||
|
_match = _filter["match"]
|
||||||
|
is_whitelist = _filter["is_whitelist"]
|
||||||
|
is_insensitive = _filter["is_insensitive"]
|
||||||
|
|
||||||
|
# If the matching pattern is empty
|
||||||
|
if not _match.strip():
|
||||||
|
return False
|
||||||
|
|
||||||
|
if is_insensitive:
|
||||||
|
search_kwargs = {"flags": re.IGNORECASE}
|
||||||
|
|
||||||
|
log.debug(
|
||||||
|
"matching algorithm '%s', whitelist? '%s', insensitive? '%s'",
|
||||||
|
algorithm, is_whitelist, is_insensitive
|
||||||
|
)
|
||||||
|
|
||||||
|
match algorithm:
|
||||||
|
case 0:
|
||||||
|
return False
|
||||||
|
|
||||||
|
case 1: # Any
|
||||||
|
return _match_any(_match, matching_to, **search_kwargs)
|
||||||
|
|
||||||
|
case 2: # All
|
||||||
|
return _match_all(_match, matching_to, **search_kwargs)
|
||||||
|
|
||||||
|
case 3: # Exact Match
|
||||||
|
return _match_literal(_match, matching_to, **search_kwargs)
|
||||||
|
|
||||||
|
case 4: # Regular Expression
|
||||||
|
return _match_regex(_match, matching_to, **search_kwargs)
|
||||||
|
|
||||||
|
case 5: # Fuzzy Match
|
||||||
|
return _match_fuzzy(_match, matching_to, **search_kwargs)
|
Loading…
x
Reference in New Issue
Block a user