Skip to content

Commit b8344f6

Browse files
authored
feat: add max_file_size support for the GoogleDrive source, solves #1250 (#1269)
1 parent e410a59 commit b8344f6

File tree

3 files changed

+23
-2
lines changed

3 files changed

+23
-2
lines changed

docs/docs/sources/googledrive.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ The spec takes the following fields:
3232
* `recent_changes_poll_interval` (`datetime.timedelta`, optional): when set, this source provides a change capture mechanism by periodically polling Google Drive for recently modified files.
3333
* `included_patterns` (`list[str]`, optional): a list of glob patterns to include files, e.g. `["*.txt", "docs/**/*.md"]`. If not specified, all files will be included.
3434
* `excluded_patterns` (`list[str]`, optional): a list of glob patterns to exclude files, e.g. `["tmp", "**/node_modules"]`. Any file or directory matching these patterns will be excluded even if they match `included_patterns`. If not specified, no files will be excluded.
35+
* `max_file_size` (`int`, optional): when set, any source file whose size exceeds this limit (in bytes) will be ignored. Files for which Google Drive does not report a size are not affected by this limit.
3536

3637
:::info
3738

python/cocoindex/sources/_engine_builtin_specs.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ class GoogleDrive(op.SourceSpec):
4444
# See https://docs.rs/globset/latest/globset/index.html#syntax for the syntax of the patterns.
4545
excluded_patterns: list[str] | None = None
4646

47+
max_file_size: int | None = None
4748
recent_changes_poll_interval: datetime.timedelta | None = None
4849

4950

src/ops/sources/google_drive.rs

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@ pub struct Spec {
6262
recent_changes_poll_interval: Option<std::time::Duration>,
6363
included_patterns: Option<Vec<String>>,
6464
excluded_patterns: Option<Vec<String>>,
65+
max_file_size: Option<i64>,
6566
}
6667

6768
struct Executor {
@@ -70,6 +71,7 @@ struct Executor {
7071
root_folder_ids: IndexSet<Arc<str>>,
7172
recent_updates_poll_interval: Option<std::time::Duration>,
7273
pattern_matcher: PatternMatcher,
74+
max_file_size: Option<i64>,
7375
}
7476

7577
impl Executor {
@@ -97,6 +99,7 @@ impl Executor {
9799
root_folder_ids: spec.root_folder_ids.into_iter().map(Arc::from).collect(),
98100
recent_updates_poll_interval: spec.recent_changes_poll_interval,
99101
pattern_matcher: PatternMatcher::new(spec.included_patterns, spec.excluded_patterns)?,
102+
max_file_size: spec.max_file_size,
100103
})
101104
}
102105
}
@@ -303,7 +306,7 @@ impl SourceExecutor for Executor {
303306
let mut seen_ids = HashSet::new();
304307
let mut folder_ids = self.root_folder_ids.clone();
305308
let fields = format!(
306-
"files(id,name,mimeType,trashed{})",
309+
"files(id,name,mimeType,trashed,size{})",
307310
optional_modified_time(options.include_ordinal)
308311
);
309312
let mut new_folder_ids = Vec::new();
@@ -319,6 +322,12 @@ impl SourceExecutor for Executor {
319322
if !file.name.as_deref().is_some_and(|name| self.pattern_matcher.is_file_included(name)){
320323
continue
321324
}
325+
if let Some(max_size) = self.max_file_size
326+
&& let Some(file_size) = file.size
327+
&& file_size > max_size {
328+
// Skip files over the specified limit
329+
continue;
330+
}
322331
curr_rows.extend(self.visit_file(file, &mut new_folder_ids, &mut seen_ids)?);
323332
}
324333
if !curr_rows.is_empty() {
@@ -342,7 +351,7 @@ impl SourceExecutor for Executor {
342351
) -> Result<PartialSourceRowData> {
343352
let file_id = key.single_part()?.str_value()?;
344353
let fields = format!(
345-
"id,name,mimeType,trashed{}",
354+
"id,name,mimeType,trashed,size{}",
346355
optional_modified_time(options.include_ordinal)
347356
);
348357
let resp = self
@@ -375,6 +384,16 @@ impl SourceExecutor for Executor {
375384
content_version_fp: None,
376385
});
377386
}
387+
if let Some(max_size) = self.max_file_size
388+
&& let Some(file_size) = file.size
389+
&& file_size > max_size
390+
{
391+
return Ok(PartialSourceRowData {
392+
value: Some(SourceValue::NonExistence),
393+
ordinal: Some(Ordinal::unavailable()),
394+
content_version_fp: None,
395+
});
396+
}
378397
let ordinal = if options.include_ordinal {
379398
file.modified_time.map(|t| t.try_into()).transpose()?
380399
} else {

0 commit comments

Comments
 (0)