Skip to content

Commit 25743de

Browse files
committed
Add basic import from files
1 parent 3c63951 commit 25743de

File tree

5 files changed

+56
-0
lines changed

5 files changed

+56
-0
lines changed
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
#include "vector_data_generator.h"
2+
3+
namespace NYdbWorkload {
4+
5+
// Constructs the file-based data initializer for the vector workload.
// Forwards "files" and "Import vectors from files" to the base class
// (presumably the initializer's name and description -- confirm against
// TWorkloadDataInitializerBase) and binds Params to the given parameters.
TWorkloadVectorFilesDataInitializer::TWorkloadVectorFilesDataInitializer(
    const TVectorWorkloadParams& params)
    : TWorkloadDataInitializerBase("files", "Import vectors from files", params)
    , Params(params)
{
}
9+
10+
// Registers the command-line options of this initializer.
// Adds a required option `-i` / `--input`; its value is stored into DataFiles.
void TWorkloadVectorFilesDataInitializer::ConfigureOpts(NLastGetopt::TOpts& opts) {
    opts.AddLongOption('i', "input",
            "File or Directory with dataset. If directory is set, all its available files will be used. "
            "Supports zipped and unzipped csv, tsv files and parquet ones that may be downloaded here: "
            "https://huggingface.co/datasets/Cohere/wikipedia-22-12-simple-embeddings. "
            // Spelling fixed in the user-visible help: "performance", "parallel".
            "For better performance you may split it to some parts for parallel upload."
        ).Required().StoreResult(&DataFiles);
}
18+
19+
// Produces the bulk data generators used for the initial import.
// The returned list holds exactly one TDataGenerator, built from this
// initializer, the table name taken from Params (passed twice), the DataFiles
// path, a default-constructed column list and line-based portion sizing.
TBulkDataGeneratorList TWorkloadVectorFilesDataInitializer::DoGetBulkInitialData() {
    const auto& tableName = Params.TableName;
    auto filesGenerator = std::make_shared<TDataGenerator>(
        *this,
        tableName,
        0,
        tableName,
        DataFiles,
        Default<TVector<TString>>(),
        TDataGenerator::EPortionSizeUnit::Line);
    return {std::move(filesGenerator)};
}
24+
25+
} // namespace NYdbWorkload
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
#pragma once
2+
3+
#include "vector_workload_params.h"
4+
5+
#include <ydb/library/workload/benchmark_base/workload.h>
6+
#include <ydb/library/workload/benchmark_base/data_generator.h>
7+
8+
namespace NYdbWorkload {
9+
10+
class TWorkloadVectorFilesDataInitializer : public TWorkloadDataInitializerBase {
11+
private:
12+
const TVectorWorkloadParams& Params;
13+
TString DataFiles;
14+
15+
public:
16+
TWorkloadVectorFilesDataInitializer(const TVectorWorkloadParams& params);
17+
18+
virtual void ConfigureOpts(NLastGetopt::TOpts& opts) override;
19+
virtual TBulkDataGeneratorList DoGetBulkInitialData() override;
20+
};
21+
22+
} // namespace NYdbWorkload

ydb/library/workload/vector/vector_workload_params.cpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
#include "vector_data_generator.h"
12
#include "vector_enums.h"
23
#include "vector_workload_params.h"
34
#include "vector_workload_generator.h"
@@ -193,6 +194,12 @@ THolder<IWorkloadQueryGenerator> TVectorWorkloadParams::CreateGenerator() const
193194
return MakeHolder<TVectorWorkloadGenerator>(this);
194195
}
195196

197+
// Returns the data initializers offered by the vector workload. The list
// contains a single entry: the file-based importer bound to these parameters.
TWorkloadDataInitializer::TList TVectorWorkloadParams::CreateDataInitializers() const {
    auto filesInitializer = std::make_shared<TWorkloadVectorFilesDataInitializer>(*this);
    return {std::move(filesInitializer)};
}
202+
196203
TString TVectorWorkloadParams::GetWorkloadName() const {
197204
return "vector";
198205
}

ydb/library/workload/vector/vector_workload_params.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ class TVectorWorkloadParams final: public TWorkloadBaseParams {
1818
public:
1919
void ConfigureOpts(NLastGetopt::TOpts& opts, const ECommandType commandType, int workloadType) override;
2020
THolder<IWorkloadQueryGenerator> CreateGenerator() const override;
21+
TWorkloadDataInitializer::TList CreateDataInitializers() const override;
2122
TString GetWorkloadName() const override;
2223
void Validate(const ECommandType commandType, int workloadType) override;
2324

ydb/library/workload/vector/ya.make

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ LIBRARY()
22

33
SRCS(
44
vector_command_index.cpp
5+
vector_data_generator.cpp
56
vector_recall_evaluator.cpp
67
vector_sampler.cpp
78
vector_sql.cpp

0 commit comments

Comments
 (0)