Skip to content

Log settings changes and ignores #221

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 23 additions & 1 deletion libgrabsite/wpull_hooks.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import traceback
import asyncio
import urllib.parse
import logging

from wpull.application.hook import Actions
from wpull.application.plugin import WpullPlugin, PluginFunctions, hook, event
Expand Down Expand Up @@ -146,13 +147,15 @@ def activate(self):
self.loop = asyncio.get_event_loop()
self.enable_stdio_capture()
self.add_signal_handlers()
self.logger = logging.getLogger("grab_site.wpull_plugin")
self.init_job_data()
self.init_ws()
self.setup_watchers()
self.all_start_urls = open(cf("all_start_urls")).read().rstrip("\n").split("\n")
self.all_start_netlocs = set(urllib.parse.urlparse(url).netloc for url in self.all_start_urls)
self.skipped_videos = open(cf("skipped_videos"), "w", encoding="utf-8")
self.skipped_max_content_length = open(cf("skipped_max_content_length"), "w", encoding="utf-8")
self.compiled_ignores = []
self.update_ignores()
super().activate()

Expand Down Expand Up @@ -255,6 +258,7 @@ def update_max_content_length(self):
return
with open(self.watchers["max_content_length"].fname, "r") as f:
self.job_data["max_content_length"] = int(f.read().strip())
self.logger.info(f"Settings change: max_content_length = {self.job_data['max_content_length']}")

@swallow_exception
def update_delay(self):
Expand All @@ -266,6 +270,8 @@ def update_delay(self):
self.job_data["delay_min"], self.job_data["delay_max"] = list(int(s) for s in content.split("-", 1))
else:
self.job_data["delay_min"] = self.job_data["delay_max"] = int(content)
max_string = f"-{self.job_data['delay_max']}" if self.job_data["delay_min"] != self.job_data["delay_max"] else ""
self.logger.info(f"Settings change: delay = {self.job_data['delay_min']}{max_string}")

@swallow_exception
def update_concurrency(self):
Expand All @@ -278,6 +284,7 @@ def update_concurrency(self):
concurrency = 1
self.job_data["concurrency"] = concurrency
self.app_session.factory["PipelineSeries"].concurrency = concurrency
self.logger.info(f"Settings change: concurrency = {concurrency}")

stop_path = cf("stop")
def should_stop(self):
Expand All @@ -298,6 +305,9 @@ def update_video(self):
@swallow_exception
def update_scrape(self):
scrape = path_exists_with_cache(self.scrape_path)
if scrape == self.job_data["scrape"]:
return
self.logger.info(f"Settings change: scrape = {scrape}")
self.job_data["scrape"] = scrape
if not scrape:
# Empty the list of scrapers, which will stop scraping for new URLs
Expand Down Expand Up @@ -329,6 +339,15 @@ def update_ignores(self):
for ig in sorted(ignores):
self.print_to_terminal(f"\t{ig}")

# Log changes
old_ignores = set(x[0] for x in self.compiled_ignores)
added_ignores = ignores - old_ignores
removed_ignores = old_ignores - ignores
for ig in added_ignores:
self.logger.info(f"Adding ignore: {ig}")
for ig in removed_ignores:
self.logger.info(f"Removing ignore: {ig}")

self.compiled_ignores = [(ig, re_compile(ig)) for ig in ignores]
self.combined_ignore_regexp = compile_combined_regexp(ignores)

Expand Down Expand Up @@ -366,7 +385,9 @@ def accept_url(self, item_session: ItemSession, verdict: bool, reasons: dict):
if should_ignore:
if not self.job_data["suppress_ignore_reports"]:
pattern = self.get_specific_ignore_pattern(url)
self.maybe_log_ignore(url, pattern)
else:
pattern = "[ignore pattern match]"
self.maybe_log_ignore(url, pattern)
return False

# If we get here, none of our ignores apply. Return the original verdict.
Expand Down Expand Up @@ -405,6 +426,7 @@ def handle_result(self, url_info, record_info, error_info, response):
return Actions.NORMAL

def maybe_log_ignore(self, url, pattern):
self.logger.info(f"Ignoring ‘{url}’: {pattern}")
if not self.job_data["suppress_ignore_reports"]:
self.print_to_terminal(f"IGNOR {url}\n by {pattern}")
self.put_ws_queue({
Expand Down