Skip to content

Commit 2f26d66

Browse files
authored
Merge pull request #2045 from getappmap/feat/ignore-large-data-files
feat: Ignore large data files
2 parents a097d76 + 4b10599 commit 2f26d66

File tree

3 files changed

+96
-60
lines changed

3 files changed

+96
-60
lines changed

packages/cli/src/fulltext/FileIndex.ts

Lines changed: 78 additions & 58 deletions
Original file line number | Diff line number | Diff line change
@@ -173,73 +173,85 @@ export async function buildFileIndex(
173173
}
174174

175175
const BINARY_FILE_EXTENSIONS: string[] = [
176-
'png',
177-
'jpg',
178-
'jpeg',
179-
'gif',
176+
'7z',
177+
'aac',
178+
'avi',
180179
'bmp',
181-
'ico',
182-
'tiff',
183-
'webp',
184-
'svg',
185-
'mp3',
186-
'wav',
187-
'ogg',
180+
'bz2',
181+
'class',
182+
'dll',
183+
'doc',
184+
'docx',
185+
'dylib',
186+
'ear',
187+
'exe',
188+
'eot',
188189
'flac',
189-
'aac',
190-
'mp4',
191-
'webm',
190+
'flv',
191+
'gif',
192+
'gz',
193+
'ico',
194+
'jar',
195+
'jpeg',
196+
'jpg',
197+
'min.js',
198+
'min.css',
199+
'mjs',
192200
'mkv',
193-
'avi',
201+
'mo',
194202
'mov',
195-
'wmv',
203+
'mp3',
204+
'mp4',
196205
'mpg',
197-
'flv',
198-
'zip',
199-
'tar',
200-
'gz',
201-
'bz2',
202-
'xz',
203-
'7z',
204-
'rar',
206+
'odt',
207+
'odp',
208+
'ods',
209+
'ogg',
210+
'otf',
205211
'pdf',
206-
'doc',
207-
'docx',
208-
'xls',
209-
'xlsx',
212+
'po',
213+
'png',
210214
'ppt',
211215
'pptx',
212-
'odt',
213-
'ods',
214-
'odp',
216+
'pyc',
217+
'rar',
215218
'rtf',
219+
'so',
220+
'svg',
221+
'tar',
222+
'tiff',
223+
'ttf',
224+
'wav',
225+
'webm',
226+
'webp',
216227
'woff',
217228
'woff2',
218-
'eot',
219-
'ttf',
220-
'otf',
221-
'mo',
222-
'po',
223-
'pyc',
224-
'ico',
225-
'flv',
226-
'avi',
227-
'mov',
228229
'wmv',
229-
'mpg',
230-
'jar',
231-
'war',
232-
'ear',
233-
'class',
234-
'so',
235-
'dll',
236-
'dylib',
237-
'o',
238-
'exe',
239-
'min.js',
240-
'min.css',
230+
'xls',
231+
'xlsx',
232+
'xz',
233+
'zip',
241234
].map((ext) => '.' + ext);
242235

236+
const DATA_FILE_EXTENSIONS: string[] = [
237+
'csv',
238+
'dat',
239+
'log',
240+
'json',
241+
'tsv',
242+
'yaml',
243+
'yml',
244+
'xml',
245+
].map((ext) => '.' + ext);
246+
247+
const isBinaryFile = (fileName: string) => {
248+
return BINARY_FILE_EXTENSIONS.some((ext) => fileName.endsWith(ext));
249+
};
250+
251+
const isDataFile = (fileName: string) => {
252+
return DATA_FILE_EXTENSIONS.some((ext) => fileName.endsWith(ext));
253+
};
254+
243255
export async function filterFiles(
244256
directory: string,
245257
fileNames: string[],
@@ -248,23 +260,31 @@ export async function filterFiles(
248260
): Promise<string[]> {
249261
const result: string[] = [];
250262
for (const fileName of fileNames) {
251-
const fileExtension = path.extname(fileName).toLowerCase();
252-
if (BINARY_FILE_EXTENSIONS.some((ext) => ext === fileExtension)) continue;
263+
if (isBinaryFile(fileName)) continue;
253264

254265
const includeFile = fileNameMatchesFilterPatterns(fileName, includePatterns, excludePatterns);
255266
if (!includeFile) continue;
256267

268+
let appendFile = false;
257269
try {
258270
const stats = await stat(join(directory, fileName));
259271
if (stats.isFile()) {
260-
if (stats.size > 50_000) debug(`WARNING Large file ${fileName} with size ${stats.size}`);
261-
262-
result.push(fileName);
272+
appendFile = true;
273+
if (stats.size > 50_000) {
274+
if (isDataFile(fileName)) {
275+
debug(`Skipping large data file ${fileName} with size ${stats.size}`);
276+
appendFile = false;
277+
} else {
278+
debug(`WARNING Large file ${fileName} with size ${stats.size}`);
279+
}
280+
}
263281
}
264282
} catch (error) {
265283
console.warn(`Error checking file ${fileName}`);
266284
console.warn(error);
267285
}
286+
287+
if (appendFile) result.push(fileName);
268288
}
269289
return result;
270290
}

packages/cli/tests/unit/fulltext/FileIndex.spec.ts

Lines changed: 17 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -155,16 +155,31 @@ describe(filterFiles, () => {
155155
jest.resetAllMocks();
156156
});
157157

158-
it('filters out files with binary extensions and non-files', async () => {
158+
it('filters out binary files, non-files, and large data files', async () => {
159159
const dir = tmp.dirSync({ unsafeCleanup: true }).name;
160160
writeFileSync(join(dir, 'file.txt'), 'hello');
161161
writeFileSync(join(dir, 'file.zip'), 'hello');
162162
writeFileSync(join(dir, 'file.json'), 'hello');
163163
writeFileSync(join(dir, 'large.txt'), Buffer.alloc(100_000));
164+
writeFileSync(join(dir, 'large.js'), Buffer.alloc(100_000));
165+
writeFileSync(join(dir, 'large.ts'), Buffer.alloc(100_000));
166+
writeFileSync(join(dir, 'large.haml'), Buffer.alloc(100_000));
167+
writeFileSync(join(dir, 'large.java'), Buffer.alloc(100_000));
168+
writeFileSync(join(dir, 'large.mjs'), Buffer.alloc(100_000));
169+
writeFileSync(join(dir, 'large.pyc'), Buffer.alloc(100_000));
170+
writeFileSync(join(dir, 'large.json'), Buffer.alloc(100_000));
164171
mkdirSync(join(dir, 'dir'));
165172

166173
const fileList = readdirSync(dir);
167174
const filtered = await filterFiles(dir, fileList);
168-
expect(filtered).toEqual(['file.json', 'file.txt', 'large.txt']);
175+
expect(filtered).toEqual([
176+
'file.json',
177+
'file.txt',
178+
'large.haml',
179+
'large.java',
180+
'large.js',
181+
'large.ts',
182+
'large.txt',
183+
]);
169184
});
170185
});

packages/cli/tests/unit/rpc/explain/pattern.spec.ts

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -17,6 +17,7 @@ describe('Regex patterns', () => {
1717
{ path: '_.navie', pattern: EXCLUDE_DOT_NAVIE_DIR, shouldMatch: false },
1818
{ path: '._navie', pattern: EXCLUDE_DOT_NAVIE_DIR, shouldMatch: false },
1919
{ path: '/navie/.navie/', pattern: EXCLUDE_DOT_NAVIE_DIR, shouldMatch: true },
20+
{ path: '/non-matching/test/file', pattern: /some_other_regex/, shouldMatch: false },
2021
];
2122

2223
testCases.forEach(({ path, pattern, shouldMatch }) => {

0 commit comments

Comments
 (0)