1515from selectolax .parser import HTMLParser
1616
1717from src .aws import is_aws_configured
18- from src .models . utils import from_jsonl , to_jsonl
18+ from src .local_store import read_meetings , write_meetings
1919
2020from .models .meeting import Meeting
2121
# Granicus publisher listing URL (view_id=4).
# NOTE(review): presumably the page scraped for the meeting list — confirm against the fetch code.
BASE_URL = "https://tulsa-ok.granicus.com/ViewPublisher.php?view_id=4"
# Bucket name and object key of the JSONL meetings registry kept in S3.
# NOTE(review): confirm whether anything still reads these two values.
TGOV_BUCKET_NAME = "tgov-meetings"
MEETINGS_REGISTRY_PATH = "data/meetings.jsonl"
2525
26-
2726async def fetch_page (url : str , session : aiohttp .ClientSession ) -> str :
2827 """
2928 Fetch the HTML content of a page.
@@ -40,11 +39,9 @@ async def fetch_page(url: str, session: aiohttp.ClientSession) -> str:
4039 raise Exception (f"Failed to fetch { url } , status code: { response .status } " )
4140 return await response .text ()
4241
43-
def clean_date(date: str) -> str:
    """
    Normalize the whitespace in a date string.

    Args:
        date: Raw date text, possibly containing newlines, tabs or
            repeated spaces.

    Returns:
        The text with every run of whitespace collapsed to a single space
        and leading/trailing whitespace removed.
    """
    collapsed = re.sub(r"\s+", " ", date)
    return collapsed.strip()
4644
47-
4845async def parse_meetings (html : str ) -> List [Dict [str , str ]]:
4946 """
5047 Parse the meeting data from the HTML content.
@@ -67,21 +64,18 @@ async def parse_meetings(html: str) -> List[Dict[str, str]]:
6764 # Process each table
6865 for table in tables :
6966 for row in table .css ("tr.listingRow" ):
70- cells = row .css ("td" )
7167 name_cells = row .css ('td.listItem[headers^="Name"]' )
7268 meeting_name = name_cells [0 ].text ().strip () if name_cells else "Unknown"
7369
7470 date_cells = row .css ('td.listItem[headers^="Date"]' )
7571 raw_date = clean_date (date_cells [0 ].text ().strip ()) if date_cells else "Unknown"
7672 meeting_date = raw_date .split ("-" )[0 ].strip () if "-" in raw_date else raw_date
7773
78-
7974 duration_cells = row .css ('td.listItem[headers^="Duration"]' )
8075 duration_str = duration_cells [0 ].text ().strip () if duration_cells else "Unknown"
8176 minutes = duration_to_minutes (duration_str )
8277 meeting_duration = f"{ minutes // 60 } :{ minutes % 60 :02d} " if minutes is not None else "Unknown"
8378
84-
8579 meeting_data = {
8680 "meeting" : meeting_name ,
8781 "date" : meeting_date ,
@@ -131,7 +125,6 @@ async def parse_meetings(html: str) -> List[Dict[str, str]]:
131125
132126 return meetings
133127
134-
135128async def get_tgov_meetings () -> Sequence [Meeting ]:
136129 """
137130 Fetch and parse meeting data from the Government Access Television website.
@@ -147,7 +140,6 @@ async def get_tgov_meetings() -> Sequence[Meeting]:
147140 meetings = [Meeting (** meeting_dict ) for meeting_dict in meeting_dicts ]
148141 return meetings
149142
150-
151143def duration_to_minutes (duration ):
152144 if not duration or pd .isna (duration ):
153145 return None
@@ -172,43 +164,25 @@ def duration_to_minutes(duration):
172164 except :
173165 return None
174166
175-
def get_registry_meetings() -> Sequence[Meeting]:
    """
    Load the meetings registry from the configured backend.

    When `is_aws_configured()` is true, the registry is read with
    `Meeting.scan()`; otherwise it is read with `read_meetings()` from the
    local store.

    Returns:
        The meetings currently held in the registry.
    """
    if is_aws_configured():
        print('Getting registry from DynamoDB.')
        # list() turns whatever iterable scan() yields into a concrete list.
        return list(Meeting.scan())

    print('Getting registry from local store')
    return read_meetings()
192174
def write_registry_meetings(meetings: Sequence[Meeting]) -> Sequence[Meeting]:
    """
    Persist meetings to the configured registry backend.

    When `is_aws_configured()` is true, each meeting that has a truthy
    `clip_id` is saved with `meeting.save()`; meetings without one are
    reported and skipped. Otherwise the whole sequence is handed to
    `write_meetings()` from the local store.

    Args:
        meetings: The meetings to write.

    Returns:
        The same `meetings` sequence that was passed in, including any
        entries that were skipped.
    """
    if is_aws_configured():
        print('Writing registry to DynamoDB.')
        # NOTE(review): the batch_writer() context is entered but its handle
        # is never used; every item is written through its own save() call.
        # Confirm this actually batches the writes.
        with Meeting.batch_writer():
            for meeting in meetings:
                if not meeting.clip_id:
                    print(f'Skipping meeting with missing clip_id: {meeting}')
                    continue
                meeting.save()
    else:
        print('Writing registry to local store')
        write_meetings(meetings)

    return meetings
0 commit comments