33
33
GCP_PROJECT = os .environ .get ("GCP_PROJECT" , None )
34
34
BUCKET_NAME = os .environ .get ("BUCKET_NAME" , None )
35
35
SCRAPE_LAKE_PREFIX = os .environ .get ("BUCKET_PREFIX" , "legislation" )
36
+ DAG_RUN_START = os .environ .get ("DAG_RUN_START" , None )
36
37
37
38
38
39
class _Unset :
@@ -102,7 +103,7 @@ def do_scrape(
102
103
]
103
104
)
104
105
105
- last_scrape_end_datetime = datetime .datetime .utcnow ()
106
+ last_scrape_datetime = DAG_RUN_START or datetime .datetime .utcnow (). isoformat ()
106
107
for scraper_name , scrape_args in scrapers .items ():
107
108
ScraperCls = juris .scrapers [scraper_name ]
108
109
if (
@@ -131,7 +132,6 @@ def do_scrape(
131
132
file_archiving_enabled = args .archive ,
132
133
)
133
134
partial_report = scraper .do_scrape (** scrape_args , session = session )
134
- last_scrape_end_datetime = partial_report ["end" ]
135
135
stats .write_stats (
136
136
[
137
137
{
@@ -165,7 +165,6 @@ def do_scrape(
165
165
file_archiving_enabled = args .archive ,
166
166
)
167
167
report [scraper_name ] = scraper .do_scrape (** scrape_args )
168
- last_scrape_end_datetime = report [scraper_name ]["end" ]
169
168
session = scrape_args .get ("session" , "" )
170
169
if session :
171
170
stats .write_stats (
@@ -201,13 +200,13 @@ def do_scrape(
201
200
# optionally upload scrape output to cloud storage
202
201
# but do not archive if realtime mode enabled, as realtime mode has its own archiving process
203
202
if args .archive and not args .realtime :
204
- archive_to_cloud_storage (datadir , juris , last_scrape_end_datetime )
203
+ archive_to_cloud_storage (datadir , juris , last_scrape_datetime )
205
204
206
205
return report
207
206
208
207
209
208
def archive_to_cloud_storage (
210
- datadir : str , juris : State , last_scrape_end_datetime : datetime . datetime
209
+ datadir : str , juris : State , last_scrape_datetime : str
211
210
) -> None :
212
211
# check if we have necessary settings
213
212
if GCP_PROJECT is None or BUCKET_NAME is None :
@@ -224,7 +223,7 @@ def archive_to_cloud_storage(
224
223
bucket = cloud_storage_client .bucket (BUCKET_NAME )
225
224
jurisdiction_id = juris .jurisdiction_id .replace ("ocd-jurisdiction/" , "" )
226
225
destination_prefix = (
227
- f"{ SCRAPE_LAKE_PREFIX } /{ jurisdiction_id } /{ last_scrape_end_datetime . isoformat () } "
226
+ f"{ SCRAPE_LAKE_PREFIX } /{ jurisdiction_id } /{ last_scrape_datetime } "
228
227
)
229
228
230
229
# read files in directory and upload
0 commit comments