import traceback
import asyncio
import urllib.parse
+import logging

from wpull.application.hook import Actions
from wpull.application.plugin import WpullPlugin, PluginFunctions, hook, event
@@ -146,13 +147,15 @@ def activate(self):
		self.loop = asyncio.get_event_loop()
		self.enable_stdio_capture()
		self.add_signal_handlers()
+		self.logger = logging.getLogger("grab_site.wpull_plugin")
		self.init_job_data()
		self.init_ws()
		self.setup_watchers()
		self.all_start_urls = open(cf("all_start_urls")).read().rstrip("\n").split("\n")
		self.all_start_netlocs = set(urllib.parse.urlparse(url).netloc for url in self.all_start_urls)
		self.skipped_videos = open(cf("skipped_videos"), "w", encoding="utf-8")
		self.skipped_max_content_length = open(cf("skipped_max_content_length"), "w", encoding="utf-8")
+		self.compiled_ignores = []
		self.update_ignores()
		super().activate()

@@ -255,6 +258,7 @@ def update_max_content_length(self):
			return
		with open(self.watchers["max_content_length"].fname, "r") as f:
			self.job_data["max_content_length"] = int(f.read().strip())
+		self.logger.info(f"Settings change: max_content_length = {self.job_data['max_content_length']}")

	@swallow_exception
	def update_delay(self):
@@ -266,6 +270,8 @@ def update_delay(self):
			self.job_data["delay_min"], self.job_data["delay_max"] = list(int(s) for s in content.split("-", 1))
		else:
			self.job_data["delay_min"] = self.job_data["delay_max"] = int(content)
+		max_string = f"-{self.job_data['delay_max']}" if self.job_data["delay_min"] != self.job_data["delay_max"] else ""
+		self.logger.info(f"Settings change: delay = {self.job_data['delay_min']}{max_string}")

	@swallow_exception
	def update_concurrency(self):
@@ -278,6 +284,7 @@ def update_concurrency(self):
			concurrency = 1
		self.job_data["concurrency"] = concurrency
		self.app_session.factory["PipelineSeries"].concurrency = concurrency
+		self.logger.info(f"Settings change: concurrency = {concurrency}")

	stop_path = cf("stop")
	def should_stop(self):
@@ -298,6 +305,9 @@ def update_video(self):
	@swallow_exception
	def update_scrape(self):
		scrape = path_exists_with_cache(self.scrape_path)
+		if scrape == self.job_data["scrape"]:
+			return
+		self.logger.info(f"Settings change: scrape = {scrape}")
		self.job_data["scrape"] = scrape
		if not scrape:
			# Empty the list of scrapers, which will stop scraping for new URLs
@@ -329,6 +339,15 @@ def update_ignores(self):
		for ig in sorted(ignores):
			self.print_to_terminal(f"\t{ig}")

+		# Log changes
+		old_ignores = set(x[0] for x in self.compiled_ignores)
+		added_ignores = ignores - old_ignores
+		removed_ignores = old_ignores - ignores
+		for ig in added_ignores:
+			self.logger.info(f"Adding ignore: {ig}")
+		for ig in removed_ignores:
+			self.logger.info(f"Removing ignore: {ig}")
+
		self.compiled_ignores = [(ig, re_compile(ig)) for ig in ignores]
		self.combined_ignore_regexp = compile_combined_regexp(ignores)

@@ -366,7 +385,9 @@ def accept_url(self, item_session: ItemSession, verdict: bool, reasons: dict):
		if should_ignore:
			if not self.job_data["suppress_ignore_reports"]:
				pattern = self.get_specific_ignore_pattern(url)
-				self.maybe_log_ignore(url, pattern)
+			else:
+				pattern = "[ignore pattern match]"
+			self.maybe_log_ignore(url, pattern)
			return False

		# If we get here, none of our ignores apply. Return the original verdict.
@@ -405,6 +426,7 @@ def handle_result(self, url_info, record_info, error_info, response):
		return Actions.NORMAL

	def maybe_log_ignore(self, url, pattern):
+		self.logger.info(f"Ignoring ‘{url}’: {pattern}")
		if not self.job_data["suppress_ignore_reports"]:
			self.print_to_terminal(f"IGNOR {url}\n by {pattern}")
		self.put_ws_queue({