
Commit 3e62eb4

Log settings changes and ignores
1 parent 9e6e95b

1 file changed: +23 -1 lines changed

libgrabsite/wpull_hooks.py

@@ -9,6 +9,7 @@
 import traceback
 import asyncio
 import urllib.parse
+import logging

 from wpull.application.hook import Actions
 from wpull.application.plugin import WpullPlugin, PluginFunctions, hook, event
@@ -146,13 +147,15 @@ def activate(self):
         self.loop = asyncio.get_event_loop()
         self.enable_stdio_capture()
         self.add_signal_handlers()
+        self.logger = logging.getLogger("grab_site.wpull_plugin")
         self.init_job_data()
         self.init_ws()
         self.setup_watchers()
         self.all_start_urls = open(cf("all_start_urls")).read().rstrip("\n").split("\n")
         self.all_start_netlocs = set(urllib.parse.urlparse(url).netloc for url in self.all_start_urls)
         self.skipped_videos = open(cf("skipped_videos"), "w", encoding="utf-8")
         self.skipped_max_content_length = open(cf("skipped_max_content_length"), "w", encoding="utf-8")
+        self.compiled_ignores = []
         self.update_ignores()
         super().activate()
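
The activate() hunk above routes the plugin's new messages through a named logger instead of only printing to the terminal. A minimal standalone sketch of how such a named logger emits once any handler exists, assuming nothing about grab-site's real logging setup (the basicConfig format string below is invented for illustration):

    import logging

    # Illustrative only: grab-site/wpull wire up their own handlers;
    # basicConfig here just makes the named logger's output visible.
    logging.basicConfig(level=logging.INFO, format="%(name)s %(levelname)s %(message)s")

    logger = logging.getLogger("grab_site.wpull_plugin")
    logger.info("Settings change: concurrency = 2")
    # -> grab_site.wpull_plugin INFO Settings change: concurrency = 2
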

@@ -255,6 +258,7 @@ def update_max_content_length(self):
             return
         with open(self.watchers["max_content_length"].fname, "r") as f:
             self.job_data["max_content_length"] = int(f.read().strip())
+        self.logger.info(f"Settings change: max_content_length = {self.job_data['max_content_length']}")

     @swallow_exception
     def update_delay(self):
@@ -266,6 +270,8 @@ def update_delay(self):
             self.job_data["delay_min"], self.job_data["delay_max"] = list(int(s) for s in content.split("-", 1))
         else:
             self.job_data["delay_min"] = self.job_data["delay_max"] = int(content)
+        max_string = f"-{self.job_data['delay_max']}" if self.job_data["delay_min"] != self.job_data["delay_max"] else ""
+        self.logger.info(f"Settings change: delay = {self.job_data['delay_min']}{max_string}")

     @swallow_exception
     def update_concurrency(self):
@@ -278,6 +284,7 @@ def update_concurrency(self):
             concurrency = 1
         self.job_data["concurrency"] = concurrency
         self.app_session.factory["PipelineSeries"].concurrency = concurrency
+        self.logger.info(f"Settings change: concurrency = {concurrency}")

     stop_path = cf("stop")
     def should_stop(self):
@@ -298,6 +305,9 @@ def update_video(self):
     @swallow_exception
     def update_scrape(self):
         scrape = path_exists_with_cache(self.scrape_path)
+        if scrape == self.job_data["scrape"]:
+            return
+        self.logger.info(f"Settings change: scrape = {scrape}")
         self.job_data["scrape"] = scrape
         if not scrape:
             # Empty the list of scrapers, which will stop scraping for new URLs
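
Note that update_scrape now returns early when the watched value has not actually changed, so each toggle of the scrape setting produces exactly one log line. A hypothetical generic form of that guard (update_setting and all of its parameters are invented for illustration):

    def update_setting(job_data, key, new_value, logger):
        # Invented helper: log and store only on an actual change.
        if new_value == job_data[key]:
            return
        logger.info(f"Settings change: {key} = {new_value}")
        job_data[key] = new_value
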
@@ -329,6 +339,15 @@ def update_ignores(self):
             for ig in sorted(ignores):
                 self.print_to_terminal(f"\t{ig}")

+        # Log changes
+        old_ignores = set(x[0] for x in self.compiled_ignores)
+        added_ignores = ignores - old_ignores
+        removed_ignores = old_ignores - ignores
+        for ig in added_ignores:
+            self.logger.info(f"Adding ignore: {ig}")
+        for ig in removed_ignores:
+            self.logger.info(f"Removing ignore: {ig}")
+
         self.compiled_ignores = [(ig, re_compile(ig)) for ig in ignores]
         self.combined_ignore_regexp = compile_combined_regexp(ignores)
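
The added and removed ignores are derived with plain set differences against the patterns compiled on the previous pass (which is why activate() now seeds self.compiled_ignores with an empty list). A self-contained toy run of the same arithmetic, with invented pattern strings:

    old_ignores = {r"\.css\?", r"\.woff$"}
    new_ignores = {r"\.woff$", r"\.mp4$"}

    print(new_ignores - old_ignores)  # {'\\.mp4$'}   -> logged as "Adding ignore"
    print(old_ignores - new_ignores)  # {'\\.css\\?'} -> logged as "Removing ignore"
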

@@ -366,7 +385,9 @@ def accept_url(self, item_session: ItemSession, verdict: bool, reasons: dict):
         if should_ignore:
             if not self.job_data["suppress_ignore_reports"]:
                 pattern = self.get_specific_ignore_pattern(url)
-                self.maybe_log_ignore(url, pattern)
+            else:
+                pattern = "[ignore pattern match]"
+            self.maybe_log_ignore(url, pattern)
             return False

         # If we get here, none of our ignores apply. Return the original verdict.
@@ -405,6 +426,7 @@ def handle_result(self, url_info, record_info, error_info, response):
         return Actions.NORMAL

     def maybe_log_ignore(self, url, pattern):
+        self.logger.info(f"Ignoring ‘{url}’: {pattern}")
         if not self.job_data["suppress_ignore_reports"]:
             self.print_to_terminal(f"IGNOR {url}\n by {pattern}")
             self.put_ws_queue({
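
Taken together, the commit adds several kinds of INFO records to the job log: settings changes, ignore additions/removals, and per-URL ignores (logged even when terminal ignore reports are suppressed, since maybe_log_ignore now writes to the log before checking suppress_ignore_reports). A hedged sketch of what a job log might now contain, built from the f-strings above with every value invented:

    Settings change: max_content_length = 10485760
    Settings change: delay = 250-375
    Settings change: concurrency = 2
    Settings change: scrape = False
    Adding ignore: \.css\?
    Removing ignore: \.woff$
    Ignoring ‘http://example.com/style.css?v=3’: \.css\?
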
