Skip to content

Commit 199cb85

Browse files
Improve youtube loader (langchain-ai#3395)
Small improvements for the YouTube loader: a) use the YouTube API permission scope instead of the Google Drive scope; b) bugfix: allow transcript loading for single videos; c) add a "continue_on_failure" parameter for cases where videos in a playlist do not have transcription enabled; d) support automated translation for all languages, if available. --------- Co-authored-by: Johann-Peter Hartmann <johann-peter.hartmann@mayflower.de>
1 parent e5ffbee commit 199cb85

File tree

1 file changed

+46
-12
lines changed

1 file changed

+46
-12
lines changed

langchain/document_loaders/youtube.py

Lines changed: 46 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,7 @@
11
"""Loader that loads YouTube transcript."""
22
from __future__ import annotations
33

4+
import logging
45
from pathlib import Path
56
from typing import Any, Dict, List, Optional
67

@@ -10,7 +11,9 @@
1011
from langchain.docstore.document import Document
1112
from langchain.document_loaders.base import BaseLoader
1213

13-
SCOPES = ["https://www.googleapis.com/auth/drive.readonly"]
14+
logger = logging.getLogger(__name__)
15+
16+
SCOPES = ["https://www.googleapis.com/auth/youtube.readonly"]
1417

1518

1619
@dataclass
@@ -98,12 +101,17 @@ class YoutubeLoader(BaseLoader):
98101
"""Loader that loads Youtube transcripts."""
99102

100103
def __init__(
101-
self, video_id: str, add_video_info: bool = False, language: str = "en"
104+
self,
105+
video_id: str,
106+
add_video_info: bool = False,
107+
language: str = "en",
108+
continue_on_failure: bool = False,
102109
):
103110
"""Initialize with YouTube video ID."""
104111
self.video_id = video_id
105112
self.add_video_info = add_video_info
106113
self.language = language
114+
self.continue_on_failure = continue_on_failure
107115

108116
@classmethod
109117
def from_youtube_url(cls, youtube_url: str, **kwargs: Any) -> YoutubeLoader:
@@ -217,6 +225,7 @@ class GoogleApiYoutubeLoader(BaseLoader):
217225
video_ids: Optional[List[str]] = None
218226
add_video_info: bool = True
219227
captions_language: str = "en"
228+
continue_on_failure: bool = False
220229

221230
def __post_init__(self) -> None:
222231
self.youtube_client = self._build_youtube_client(self.google_api_client.creds)
@@ -249,12 +258,13 @@ def validate_channel_or_videoIds_is_set(
249258
def _get_transcripe_for_video_id(self, video_id: str) -> str:
250259
from youtube_transcript_api import NoTranscriptFound, YouTubeTranscriptApi
251260

252-
transcript_list = YouTubeTranscriptApi.list_transcripts(self.video_ids)
261+
transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
253262
try:
254263
transcript = transcript_list.find_transcript([self.captions_language])
255264
except NoTranscriptFound:
256-
en_transcript = transcript_list.find_transcript(["en"])
257-
transcript = en_transcript.translate(self.captions_language)
265+
for available_transcript in transcript_list:
266+
transcript = available_transcript.translate(self.captions_language)
267+
continue
258268

259269
transcript_pieces = transcript.fetch()
260270
return " ".join([t["text"].strip(" ") for t in transcript_pieces])
@@ -286,6 +296,19 @@ def _get_channel_id(self, channel_name: str) -> str:
286296
return channel_id
287297

288298
def _get_document_for_channel(self, channel: str, **kwargs: Any) -> List[Document]:
299+
try:
300+
from youtube_transcript_api import (
301+
NoTranscriptFound,
302+
TranscriptsDisabled,
303+
)
304+
except ImportError:
305+
raise ImportError(
306+
"You must run"
307+
"`pip install --upgrade "
308+
"youtube-transcript-api`"
309+
"to use the youtube loader"
310+
)
311+
289312
channel_id = self._get_channel_id(channel)
290313
request = self.youtube_client.search().list(
291314
part="id,snippet",
@@ -304,14 +327,25 @@ def _get_document_for_channel(self, channel: str, **kwargs: Any) -> List[Documen
304327
if self.add_video_info:
305328
item["snippet"].pop("thumbnails")
306329
meta_data.update(item["snippet"])
307-
video_ids.append(
308-
Document(
309-
page_content=self._get_transcripe_for_video_id(
310-
item["id"]["videoId"]
311-
),
312-
metadata=meta_data,
330+
try:
331+
page_content = self._get_transcripe_for_video_id(
332+
item["id"]["videoId"]
313333
)
314-
)
334+
video_ids.append(
335+
Document(
336+
page_content=page_content,
337+
metadata=meta_data,
338+
)
339+
)
340+
except (TranscriptsDisabled, NoTranscriptFound) as e:
341+
if self.continue_on_failure:
342+
logger.error(
343+
"Error fetching transscript "
344+
+ f" {item['id']['videoId']}, exception: {e}"
345+
)
346+
else:
347+
raise e
348+
pass
315349
request = self.youtube_client.search().list_next(request, response)
316350

317351
return video_ids

0 commit comments

Comments
 (0)