
Commit 3e62eb4

Log settings changes and ignores
1 parent 9e6e95b

1 file changed: +23 -1 lines changed

libgrabsite/wpull_hooks.py

@@ -9,6 +9,7 @@
 import traceback
 import asyncio
 import urllib.parse
+import logging

 from wpull.application.hook import Actions
 from wpull.application.plugin import WpullPlugin, PluginFunctions, hook, event
@@ -146,13 +147,15 @@ def activate(self):
         self.loop = asyncio.get_event_loop()
         self.enable_stdio_capture()
         self.add_signal_handlers()
+        self.logger = logging.getLogger("grab_site.wpull_plugin")
         self.init_job_data()
         self.init_ws()
         self.setup_watchers()
         self.all_start_urls = open(cf("all_start_urls")).read().rstrip("\n").split("\n")
         self.all_start_netlocs = set(urllib.parse.urlparse(url).netloc for url in self.all_start_urls)
         self.skipped_videos = open(cf("skipped_videos"), "w", encoding="utf-8")
         self.skipped_max_content_length = open(cf("skipped_max_content_length"), "w", encoding="utf-8")
+        self.compiled_ignores = []
         self.update_ignores()
         super().activate()
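
The activate() hunk above routes the plugin's new messages through a named logger instead of only printing to the terminal. A minimal standalone sketch of how such a named logger emits once any handler exists, assuming nothing about grab-site's real logging setup (the basicConfig format string below is invented for illustration):

    import logging

    # Illustrative only: grab-site/wpull wire up their own handlers;
    # basicConfig here just makes the named logger's output visible.
    logging.basicConfig(level=logging.INFO, format="%(name)s %(levelname)s %(message)s")

    logger = logging.getLogger("grab_site.wpull_plugin")
    logger.info("Settings change: concurrency = 2")
    # -> grab_site.wpull_plugin INFO Settings change: concurrency = 2
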

@@ -255,6 +258,7 @@ def update_max_content_length(self):
             return
         with open(self.watchers["max_content_length"].fname, "r") as f:
             self.job_data["max_content_length"] = int(f.read().strip())
+        self.logger.info(f"Settings change: max_content_length = {self.job_data['max_content_length']}")

     @swallow_exception
     def update_delay(self):
@@ -266,6 +270,8 @@ def update_delay(self):
             self.job_data["delay_min"], self.job_data["delay_max"] = list(int(s) for s in content.split("-", 1))
         else:
             self.job_data["delay_min"] = self.job_data["delay_max"] = int(content)
+        max_string = f"-{self.job_data['delay_max']}" if self.job_data["delay_min"] != self.job_data["delay_max"] else ""
+        self.logger.info(f"Settings change: delay = {self.job_data['delay_min']}{max_string}")

     @swallow_exception
     def update_concurrency(self):
@@ -278,6 +284,7 @@ def update_concurrency(self):
             concurrency = 1
         self.job_data["concurrency"] = concurrency
         self.app_session.factory["PipelineSeries"].concurrency = concurrency
+        self.logger.info(f"Settings change: concurrency = {concurrency}")

     stop_path = cf("stop")
     def should_stop(self):
@@ -298,6 +305,9 @@ def update_video(self):
     @swallow_exception
     def update_scrape(self):
         scrape = path_exists_with_cache(self.scrape_path)
+        if scrape == self.job_data["scrape"]:
+            return
+        self.logger.info(f"Settings change: scrape = {scrape}")
         self.job_data["scrape"] = scrape
         if not scrape:
             # Empty the list of scrapers, which will stop scraping for new URLs
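
Note that update_scrape now returns early when the watched value has not actually changed, so each toggle of the scrape setting produces exactly one log line. A hypothetical generic form of that guard (update_setting and all of its parameters are invented for illustration):

    def update_setting(job_data, key, new_value, logger):
        # Invented helper: log and store only on an actual change.
        if new_value == job_data[key]:
            return
        logger.info(f"Settings change: {key} = {new_value}")
        job_data[key] = new_value
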
@@ -329,6 +339,15 @@ def update_ignores(self):
             for ig in sorted(ignores):
                 self.print_to_terminal(f"\t{ig}")

+        # Log changes
+        old_ignores = set(x[0] for x in self.compiled_ignores)
+        added_ignores = ignores - old_ignores
+        removed_ignores = old_ignores - ignores
+        for ig in added_ignores:
+            self.logger.info(f"Adding ignore: {ig}")
+        for ig in removed_ignores:
+            self.logger.info(f"Removing ignore: {ig}")
+
         self.compiled_ignores = [(ig, re_compile(ig)) for ig in ignores]
         self.combined_ignore_regexp = compile_combined_regexp(ignores)
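
The added and removed ignores are derived with plain set differences against the patterns compiled on the previous pass (which is why activate() now seeds self.compiled_ignores with an empty list). A self-contained toy run of the same arithmetic, with invented pattern strings:

    old_ignores = {r"\.css\?", r"\.woff$"}
    new_ignores = {r"\.woff$", r"\.mp4$"}

    print(new_ignores - old_ignores)  # {'\\.mp4$'}   -> logged as "Adding ignore"
    print(old_ignores - new_ignores)  # {'\\.css\\?'} -> logged as "Removing ignore"
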

@@ -366,7 +385,9 @@ def accept_url(self, item_session: ItemSession, verdict: bool, reasons: dict):
         if should_ignore:
             if not self.job_data["suppress_ignore_reports"]:
                 pattern = self.get_specific_ignore_pattern(url)
-                self.maybe_log_ignore(url, pattern)
+            else:
+                pattern = "[ignore pattern match]"
+            self.maybe_log_ignore(url, pattern)
             return False

         # If we get here, none of our ignores apply. Return the original verdict.
@@ -405,6 +426,7 @@ def handle_result(self, url_info, record_info, error_info, response):
         return Actions.NORMAL

     def maybe_log_ignore(self, url, pattern):
+        self.logger.info(f"Ignoring ‘{url}’: {pattern}")
         if not self.job_data["suppress_ignore_reports"]:
             self.print_to_terminal(f"IGNOR {url}\n by {pattern}")
             self.put_ws_queue({
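
Taken together, the commit adds several kinds of INFO records to the job log: settings changes, ignore additions/removals, and per-URL ignores (logged even when terminal ignore reports are suppressed, since maybe_log_ignore now writes to the log before checking suppress_ignore_reports). A hedged sketch of what a job log might now contain, built from the f-strings above with every value invented:

    Settings change: max_content_length = 10485760
    Settings change: delay = 250-375
    Settings change: concurrency = 2
    Settings change: scrape = False
    Adding ignore: \.css\?
    Removing ignore: \.woff$
    Ignoring ‘http://example.com/style.css?v=3’: \.css\?
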
