66Television websites.
77"""
88
9- from typing import Dict , List
9+ import re
10+ from typing import Dict , List , Sequence
1011from urllib .parse import urljoin
1112
1213import aiohttp
1314import pandas as pd
1415from selectolax .parser import HTMLParser
1516
17+ from src .aws import is_aws_configured
18+ from src .models .utils import from_jsonl , to_jsonl
19+
1620from .models .meeting import Meeting
1721
# Granicus publisher page; also used as the urljoin base for relative agenda/video links.
1822BASE_URL = "https://tulsa-ok.granicus.com/ViewPublisher.php?view_id=4"
# S3 bucket passed as Bucket= when the meetings registry is read or written.
23+ TGOV_BUCKET_NAME = "tgov-meetings"
# S3 object key of the JSONL meetings registry inside that bucket.
24+ MEETINGS_REGISTRY_PATH = "data/meetings.jsonl"
1925
2026
2127async def fetch_page (url : str , session : aiohttp .ClientSession ) -> str :
@@ -35,6 +41,10 @@ async def fetch_page(url: str, session: aiohttp.ClientSession) -> str:
3541 return await response .text ()
3642
3743
def clean_date(date: str) -> str:
    """
    Normalise the whitespace in a scraped date string.

    Every run of whitespace (spaces, tabs, newlines, ...) is collapsed to a
    single space, and leading/trailing whitespace is removed.

    Args:
        date: Raw date text as extracted from the page.

    Returns:
        The date text with single spaces between words and no padding.
    """
    # split() with no separator breaks on whitespace runs and drops empty
    # leading/trailing pieces, so re-joining with one space both collapses
    # and trims in a single pass.
    words = date.split()
    return " ".join(words)
46+
47+
3848async def parse_meetings (html : str ) -> List [Dict [str , str ]]:
3949 """
4050 Parse the meeting data from the HTML content.
@@ -56,76 +66,73 @@ async def parse_meetings(html: str) -> List[Dict[str, str]]:
5666
5767 # Process each table
5868 for table in tables :
59- # Find the tbody section which contains the actual meeting rows
60- tbody = table .css_first ("tbody" )
61- if not tbody :
62- continue
63-
64- # Process each row in the tbody
65- for row in tbody .css ("tr" ):
69+ for row in table .css ("tr.listingRow" ):
6670 cells = row .css ("td" )
67- if len (cells ) < 5 :
68- continue
71+ name_cells = row .css ('td.listItem[headers^="Name"]' )
72+ meeting_name = name_cells [0 ].text ().strip () if name_cells else "Unknown"
73+
74+ date_cells = row .css ('td.listItem[headers^="Date"]' )
75+ raw_date = clean_date (date_cells [0 ].text ().strip ()) if date_cells else "Unknown"
76+ meeting_date = raw_date .split ("-" )[0 ].strip () if "-" in raw_date else raw_date
77+
78+
79+ duration_cells = row .css ('td.listItem[headers^="Duration"]' )
80+ duration_str = duration_cells [0 ].text ().strip () if duration_cells else "Unknown"
81+ minutes = duration_to_minutes (duration_str )
82+ meeting_duration = f"{ minutes // 60 } :{ minutes % 60 :02d} " if minutes is not None else "Unknown"
83+
6984
7085 meeting_data = {
71- "meeting" : cells [ 0 ]. text (). strip () ,
72- "date" : cells [ 1 ]. text (). strip () ,
73- "duration" : cells [ 2 ]. text (). strip () ,
86+ "meeting" : meeting_name ,
87+ "date" : meeting_date ,
88+ "duration" : meeting_duration ,
7489 "agenda" : None ,
90+ "clip_id" : None ,
7591 "video" : None ,
7692 }
7793
7894 # Extract agenda link if available
79- agenda_cell = cells [ 3 ]
80- agenda_link = agenda_cell .css_first ("a" )
81- if agenda_link and agenda_link . attributes . get ( "href" ) :
95+ agenda_cells = row . css ( 'td.listItem:has(a[href*="AgendaViewer.php"])' )
96+ agenda_link = agenda_cells [ 0 ] .css_first ("a" ) if agenda_cells else None
97+ if agenda_link is not None :
8298 meeting_data ["agenda" ] = urljoin (
8399 BASE_URL , agenda_link .attributes .get ("href" )
84100 )
85101
86102 # Extract video link if available
87- video_cell = cells [4 ]
88- video_link = video_cell .css_first ("a" )
89- if video_link :
90- # First try to extract from onclick attribute
103+ video_cells = row .css ('td.listItem[headers^="VideoLink"]' )
104+ video_cell = video_cells [0 ] if video_cells else None
105+ if video_cell is not None :
106+ video_link = video_cell .css_first ("a" )
107+
91108 onclick = video_link .attributes .get ("onclick" , "" )
92- if onclick :
93- # Look for window.open pattern
94- if "window.open(" in onclick :
95- # Extract URL from window.open('URL', ...)
96- start_quote = onclick .find ("'" , onclick .find ("window.open(" ))
97- end_quote = onclick .find ("'" , start_quote + 1 )
98- if start_quote > 0 and end_quote > start_quote :
99- video_url = onclick [start_quote + 1 : end_quote ]
100- # Handle protocol-relative URLs (starting with //)
101- if video_url .startswith ("//" ):
102- video_url = f"https:{ video_url } "
103- meeting_data ["video" ] = video_url
104-
105- # If onclick extraction failed, try href
106- if meeting_data ["video" ] is None and video_link .attributes .get ("href" ):
107- href = video_link .attributes .get ("href" )
108- # Handle javascript: hrefs
109- if href .startswith ("javascript:" ):
110- # Try to extract clip_id from the onclick attribute again
111- # This handles cases where href is javascript:void(0) but onclick has the real URL
112- if meeting_data ["video" ] is None and "clip_id=" in onclick :
113- start_idx = onclick .find ("clip_id=" )
114- end_idx = onclick .find ("'" , start_idx )
115- if start_idx > 0 and end_idx > start_idx :
116- clip_id = onclick [start_idx + 8 : end_idx ]
117- meeting_data ["video" ] = (
118- f"https://tulsa-ok.granicus.com/MediaPlayer.php?view_id=4&clip_id={ clip_id } "
119- )
109+ onclick_match = re .search (r"window\.open\(['\"](//[^'\"]+)['\"]" , onclick )
110+ clip_id_exp = r"clip_id=(\d+)"
111+
112+ if onclick_match :
113+ meeting_data ["video" ] = f"https:{ onclick_match .group (1 )} "
114+ clip_id_match = re .search (clip_id_exp , onclick )
115+ if clip_id_match :
116+ meeting_data ["clip_id" ] = clip_id_match .group (1 )
120117 else :
121- meeting_data ["video" ] = urljoin (BASE_URL , href )
118+ meeting_data ["clip_id" ] = None
119+ if not meeting_data ["video" ]:
120+ href = video_link .attributes .get ("href" , "" )
121+ if href .startswith ("javascript:" ):
122+ clip_id_match = re .search (clip_id_exp , href )
123+ if clip_id_match :
124+ clip_id = clip_id_match .group (1 )
125+ meeting_data ["clip_id" ] = clip_id
126+ meeting_data ["video" ] = f"https://tulsa-ok.granicus.com/MediaPlayer.php?view_id=4&clip_id={ clip_id } "
127+ else :
128+ meeting_data ["video" ] = urljoin (BASE_URL , href )
122129
123130 meetings .append (meeting_data )
124131
125132 return meetings
126133
127134
128- async def get_meetings () -> List [Meeting ]:
135+ async def get_tgov_meetings () -> Sequence [Meeting ]:
129136 """
130137 Fetch and parse meeting data from the Government Access Television website.
131138
@@ -164,3 +171,44 @@ def duration_to_minutes(duration):
164171 return hours * 60 + minutes
165172 except :
166173 return None
174+
175+
def get_registry_meetings() -> Sequence[Meeting]:
    """
    Load the meetings registry from S3.

    When ``is_aws_configured()`` is true, the object stored under
    ``MEETINGS_REGISTRY_PATH`` in the ``TGOV_BUCKET_NAME`` bucket is
    downloaded, decoded as UTF-8 and parsed with ``from_jsonl`` into
    ``Meeting`` objects.

    Returns:
        The parsed meetings, or an empty list when AWS is not configured or
        the registry object does not exist yet (S3 error code ``NoSuchKey``).

    Raises:
        botocore.exceptions.ClientError: For any S3 failure other than a
            missing registry object. Such errors used to be swallowed and
            reported as an empty registry, which could lead a caller to
            overwrite the real registry with partial data.
    """
    if not is_aws_configured():
        return []

    print(f'Getting registry from AWS S3 bucket: {TGOV_BUCKET_NAME} , path: {MEETINGS_REGISTRY_PATH} ')
    # Imported here so boto3/botocore are only loaded on the AWS path.
    import boto3
    from botocore.exceptions import ClientError

    s3 = boto3.client('s3')
    try:
        registry_response = s3.get_object(Bucket=TGOV_BUCKET_NAME, Key=MEETINGS_REGISTRY_PATH)
    except ClientError as e:
        if e.response['Error']['Code'] == 'NoSuchKey':
            print('No registry file found on S3. Returning empty list.')
            return []
        # Anything else (permissions, throttling, missing bucket, ...) is a
        # real failure and must not be mistaken for "no meetings yet".
        raise

    registry_body = registry_response['Body'].read().decode('utf-8')
    return from_jsonl(registry_body, Meeting)
191+
192+
def write_registry_meetings(meetings: Sequence[Meeting]) -> Sequence[Meeting]:
    """
    Persist the meetings registry to S3 as JSON Lines.

    The meetings are serialised with ``to_jsonl``. When
    ``is_aws_configured()`` is true, the result is uploaded to
    ``MEETINGS_REGISTRY_PATH`` in the ``TGOV_BUCKET_NAME`` bucket; otherwise
    nothing is written.

    Args:
        meetings: The meetings to store.

    Returns:
        The same ``meetings`` sequence that was passed in.

    Raises:
        botocore.exceptions.ClientError: If the upload fails; the error is
            printed and then re-raised.
    """
    # Serialisation happens up front, whether or not AWS is configured.
    payload = to_jsonl(meetings)

    if not is_aws_configured():
        return meetings

    print(f'Writing registry to AWS S3 bucket: {TGOV_BUCKET_NAME} , path: {MEETINGS_REGISTRY_PATH} ')
    # Imported here so boto3/botocore are only loaded on the AWS path.
    import boto3
    from botocore.exceptions import ClientError

    client = boto3.client('s3')
    upload_kwargs = {
        "Bucket": TGOV_BUCKET_NAME,
        "Key": MEETINGS_REGISTRY_PATH,
        "Body": payload,
        "ContentType": 'application/x-ndjson',
    }

    try:
        client.put_object(**upload_kwargs)
        print(f'Wrote {len(meetings)} meetings to S3.')
    except ClientError as err:
        print(f"Failed to write to S3: {err} ")
        raise

    return meetings
0 commit comments