Skip to content

Commit 11ad07e

Browse files
authored
Merge pull request #171 from openstates/add-scrape-metadata
Add scrape metadata
2 parents 98b0821 + 467370b commit 11ad07e

File tree

9 files changed

+60
-8
lines changed

9 files changed

+60
-8
lines changed

CHANGELOG.md

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,8 @@
11
# Changelog
22

3+
## 6.22.0 - Apr 22, 2025
4+
* Add scrape metadata (jurisdiction summary and `scraped_at` timestamp) to bill and event scraper output
5+
36
## 6.21.6 - Apr 21, 2025
47
* Add informal passage to bill action classification
58

openstates/scrape/base.py

Lines changed: 13 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -164,7 +164,7 @@ def save_object(self, obj):
164164
Generally shouldn't be called directly.
165165
"""
166166
clean_whitespace(obj)
167-
obj.pre_save(self.jurisdiction.jurisdiction_id)
167+
obj.pre_save(self.jurisdiction)
168168

169169
filename = f"{obj._type}_{obj._id}.json".replace("/", "-")
170170
self.info(f"save {obj._type} {obj} as {filename}")
@@ -346,7 +346,7 @@ def validate(self, schema=None):
346346
)
347347
)
348348

349-
def pre_save(self, jurisdiction_id):
349+
def pre_save(self, jurisdiction):
350350
pass
351351

352352
def as_dict(self):
@@ -366,6 +366,17 @@ def __setattr__(self, key, val):
366366
)
367367
super(BaseModel, self).__setattr__(key, val)
368368

369+
def add_scrape_metadata(self, jurisdiction):
370+
"""Add scrape metadata"""
371+
self.jurisdiction = {
372+
"id": jurisdiction.jurisdiction_id,
373+
"name": jurisdiction.name,
374+
"classification": jurisdiction.classification,
375+
"division_id": jurisdiction.division_id,
376+
}
377+
378+
self.scraped_at = utils.utcnow()
379+
369380

370381
class SourceMixin(object):
371382
def __init__(self):

openstates/scrape/bill.py

Lines changed: 10 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -51,9 +51,10 @@ def __init__(
5151
self.versions = []
5252
self.citations = []
5353

54-
def pre_save(self, jurisdiction_id):
54+
def pre_save(self, jurisdiction):
5555
# ensure subject is sorted for idempotent JSON output
5656
self.subject = sorted(self.subject)
57+
self.add_scrape_metadata(jurisdiction)
5758

5859
def add_action(
5960
self,
@@ -76,7 +77,14 @@ def add_action(
7677
return action
7778

7879
def add_citation(
79-
self, publication, citation, citation_type, *, effective=None, expires=None, url=None
80+
self,
81+
publication,
82+
citation,
83+
citation_type,
84+
*,
85+
effective=None,
86+
expires=None,
87+
url=None,
8088
):
8189
self.citations.append(
8290
{

openstates/scrape/event.py

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -212,3 +212,6 @@ def add_document(
212212
date=date,
213213
classification=classification,
214214
)
215+
216+
def pre_save(self, jurisdiction):
217+
self.add_scrape_metadata(jurisdiction)

openstates/scrape/schemas/bill.py

Lines changed: 9 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -2,7 +2,13 @@
22
Schema for bill objects.
33
"""
44
import copy
5-
from .common import sources, extras, fuzzy_date_blank, fuzzy_datetime
5+
from .common import (
6+
sources,
7+
extras,
8+
fuzzy_date_blank,
9+
fuzzy_datetime,
10+
jurisdiction_summary,
11+
)
612
from ...data import common
713

814
versions_or_documents = {
@@ -164,5 +170,7 @@
164170
},
165171
"sources": sources,
166172
"extras": extras,
173+
"jurisdiction": jurisdiction_summary,
174+
"scraped_at": fuzzy_datetime,
167175
},
168176
}

openstates/scrape/schemas/common.py

Lines changed: 11 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -62,3 +62,14 @@
6262
}
6363

6464
extras = {"type": "object"}
65+
66+
jurisdiction_summary = {
67+
"items": {
68+
"properties": {
69+
"id": {"type": "string", "minLength": 1},
70+
"name": {"type": "string", "minLength": 1},
71+
"classification": {"type": "string", "minLength": 1},
72+
"division_id": {"type": "string", "minLength": 1},
73+
},
74+
},
75+
}

openstates/scrape/schemas/event.py

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -8,6 +8,7 @@
88
fuzzy_date_blank,
99
fuzzy_datetime,
1010
fuzzy_datetime_blank,
11+
jurisdiction_summary,
1112
)
1213

1314
media_schema = {
@@ -134,6 +135,8 @@
134135
"sources": sources,
135136
"extras": extras,
136137
"dedupe_key": {"type": ["string", "null"], "minLength": 1},
138+
"jurisdiction": jurisdiction_summary,
139+
"scraped_at": fuzzy_datetime,
137140
},
138141
"type": "object",
139142
}

openstates/scrape/tests/test_bill_scrape.py

Lines changed: 7 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,11 +1,15 @@
11
import pytest
22
import warnings
33
from datetime import date
4-
from openstates.scrape import Bill
4+
from openstates.scrape import Bill, State
55
from openstates.utils.generic import get_pseudo_id
66
from openstates.exceptions import ScrapeValueError
77

88

9+
class NewJersey(State):
10+
pass
11+
12+
913
def toy_bill():
1014
b = Bill(
1115
identifier="HB 2017",
@@ -311,10 +315,11 @@ def test_str():
311315

312316
def test_pre_save():
313317
b = toy_bill()
318+
j = NewJersey()
314319
b.add_subject("ZZZ")
315320
b.add_subject("AAA")
316321
b.add_subject("MMM")
317-
b.pre_save(None)
322+
b.pre_save(j)
318323
assert b.subject == ["AAA", "MMM", "ZZZ"]
319324

320325

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
11
[tool.poetry]
22
name = "openstates"
3-
version = "6.21.6"
3+
version = "6.22.0"
44
description = "core infrastructure for the openstates project"
55
authors = ["James Turk <[email protected]>"]
66
license = "MIT"

0 commit comments

Comments
 (0)