11"""Loader that loads YouTube transcript."""
22from __future__ import annotations
33
4+ import logging
45from pathlib import Path
56from typing import Any , Dict , List , Optional
67
1011from langchain .docstore .document import Document
1112from langchain .document_loaders .base import BaseLoader
1213
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Read-only YouTube scope URL.
# NOTE(review): presumably handed to the Google API credential flow -- confirm
# against the code that consumes SCOPES (not visible in this section).
SCOPES = ["https://www.googleapis.com/auth/youtube.readonly"]
1417
1518
1619@dataclass
@@ -98,12 +101,17 @@ class YoutubeLoader(BaseLoader):
98101 """Loader that loads Youtube transcripts."""
99102
100103 def __init__ (
101- self , video_id : str , add_video_info : bool = False , language : str = "en"
104+ self ,
105+ video_id : str ,
106+ add_video_info : bool = False ,
107+ language : str = "en" ,
108+ continue_on_failure : bool = False ,
102109 ):
103110 """Initialize with YouTube video ID."""
104111 self .video_id = video_id
105112 self .add_video_info = add_video_info
106113 self .language = language
114+ self .continue_on_failure = continue_on_failure
107115
108116 @classmethod
109117 def from_youtube_url (cls , youtube_url : str , ** kwargs : Any ) -> YoutubeLoader :
@@ -217,6 +225,7 @@ class GoogleApiYoutubeLoader(BaseLoader):
217225 video_ids : Optional [List [str ]] = None
218226 add_video_info : bool = True
219227 captions_language : str = "en"
228+ continue_on_failure : bool = False
220229
221230 def __post_init__ (self ) -> None :
222231 self .youtube_client = self ._build_youtube_client (self .google_api_client .creds )
@@ -249,12 +258,13 @@ def validate_channel_or_videoIds_is_set(
249258 def _get_transcripe_for_video_id (self , video_id : str ) -> str :
250259 from youtube_transcript_api import NoTranscriptFound , YouTubeTranscriptApi
251260
252- transcript_list = YouTubeTranscriptApi .list_transcripts (self . video_ids )
261+ transcript_list = YouTubeTranscriptApi .list_transcripts (video_id )
253262 try :
254263 transcript = transcript_list .find_transcript ([self .captions_language ])
255264 except NoTranscriptFound :
256- en_transcript = transcript_list .find_transcript (["en" ])
257- transcript = en_transcript .translate (self .captions_language )
265+ for available_transcript in transcript_list :
266+ transcript = available_transcript .translate (self .captions_language )
267+ continue
258268
259269 transcript_pieces = transcript .fetch ()
260270 return " " .join ([t ["text" ].strip (" " ) for t in transcript_pieces ])
@@ -286,6 +296,19 @@ def _get_channel_id(self, channel_name: str) -> str:
286296 return channel_id
287297
288298 def _get_document_for_channel (self , channel : str , ** kwargs : Any ) -> List [Document ]:
299+ try :
300+ from youtube_transcript_api import (
301+ NoTranscriptFound ,
302+ TranscriptsDisabled ,
303+ )
304+ except ImportError :
305+ raise ImportError (
306+ "You must run"
307+ "`pip install --upgrade "
308+ "youtube-transcript-api`"
309+ "to use the youtube loader"
310+ )
311+
289312 channel_id = self ._get_channel_id (channel )
290313 request = self .youtube_client .search ().list (
291314 part = "id,snippet" ,
@@ -304,14 +327,25 @@ def _get_document_for_channel(self, channel: str, **kwargs: Any) -> List[Documen
304327 if self .add_video_info :
305328 item ["snippet" ].pop ("thumbnails" )
306329 meta_data .update (item ["snippet" ])
307- video_ids .append (
308- Document (
309- page_content = self ._get_transcripe_for_video_id (
310- item ["id" ]["videoId" ]
311- ),
312- metadata = meta_data ,
330+ try :
331+ page_content = self ._get_transcripe_for_video_id (
332+ item ["id" ]["videoId" ]
313333 )
314- )
334+ video_ids .append (
335+ Document (
336+ page_content = page_content ,
337+ metadata = meta_data ,
338+ )
339+ )
340+ except (TranscriptsDisabled , NoTranscriptFound ) as e :
341+ if self .continue_on_failure :
342+ logger .error (
343+ "Error fetching transscript "
344+ + f" { item ['id' ]['videoId' ]} , exception: { e } "
345+ )
346+ else :
347+ raise e
348+ pass
315349 request = self .youtube_client .search ().list_next (request , response )
316350
317351 return video_ids
0 commit comments