diff --git a/.gitignore b/.gitignore index cf75843f..0549d46a 100644 --- a/.gitignore +++ b/.gitignore @@ -84,3 +84,5 @@ sample_200x100.csv Raw_Web_Visit_sample.csv Raw_Web_Visit_Sample.csv Raw_Web_Visit_Sample.csv +app/test_models.py +credit_card_example.json diff --git a/alembic/versions/2b4e8d9f6c3a_add_completed_rows.py b/alembic/versions/2b4e8d9f6c3a_add_completed_rows.py new file mode 100644 index 00000000..5f1d56d9 --- /dev/null +++ b/alembic/versions/2b4e8d9f6c3a_add_completed_rows.py @@ -0,0 +1,30 @@ +"""add_completed_rows + +Revision ID: 2b4e8d9f6c3a +Revises: 1a8fdc23eb6f +Create Date: 2025-01-18 10:30:00.000000 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision: str = '2b4e8d9f6c3a' +down_revision: Union[str, None] = '1a8fdc23eb6f' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + # Add completed_rows column to generation_metadata table + with op.batch_alter_table('generation_metadata', schema=None) as batch_op: + batch_op.add_column(sa.Column('completed_rows', sa.Integer(), nullable=True)) + + +def downgrade() -> None: + # Remove completed_rows column from generation_metadata table + with op.batch_alter_table('generation_metadata', schema=None) as batch_op: + batch_op.drop_column('completed_rows') \ No newline at end of file diff --git a/app/client/src/Container.tsx b/app/client/src/Container.tsx index a4ea87bf..7f8cfda5 100644 --- a/app/client/src/Container.tsx +++ b/app/client/src/Container.tsx @@ -77,6 +77,25 @@ const pages: MenuItem[] = [ {LABELS[Pages.GENERATOR]} ), }, + { + key: Pages.DATASETS, + label: ( + {LABELS[Pages.DATASETS]} + ), + }, + { + key: Pages.EVALUATIONS, + label: ( + {LABELS[Pages.EVALUATIONS]} + ), + }, + { + key: Pages.EXPORTS, + label: ( + {LABELS[Pages.EXPORTS]} + ), + }, + // { // key: Pages.TELEMETRY, // label: ( @@ -107,7 +126,7 @@ const 
pages: MenuItem[] = [ - {`SDS-1.0.2`} + {`SDS-1.0.3`} diff --git a/app/client/src/api/Datasets/response.ts b/app/client/src/api/Datasets/response.ts index f3738346..c58ab011 100644 --- a/app/client/src/api/Datasets/response.ts +++ b/app/client/src/api/Datasets/response.ts @@ -22,6 +22,7 @@ export type DatasetResponse = { job_name: string; job_status: string; inference_type: string; + completed_rows: number; // Add this line }; export type ModelParameters = { diff --git a/app/client/src/assets/ic-arrow-right-light.svg b/app/client/src/assets/ic-arrow-right-light.svg new file mode 100644 index 00000000..b783f56e --- /dev/null +++ b/app/client/src/assets/ic-arrow-right-light.svg @@ -0,0 +1,3 @@ + + + diff --git a/app/client/src/assets/ic-brand-alternative-data.svg b/app/client/src/assets/ic-brand-alternative-data.svg new file mode 100644 index 00000000..d8bae68c --- /dev/null +++ b/app/client/src/assets/ic-brand-alternative-data.svg @@ -0,0 +1,8 @@ + + + + + + + + diff --git a/app/client/src/assets/ic-brand-inventory-ordering.svg b/app/client/src/assets/ic-brand-inventory-ordering.svg new file mode 100644 index 00000000..da494765 --- /dev/null +++ b/app/client/src/assets/ic-brand-inventory-ordering.svg @@ -0,0 +1,9 @@ + + + + + + + + + diff --git a/app/client/src/assets/ic-brand-iot.svg b/app/client/src/assets/ic-brand-iot.svg new file mode 100644 index 00000000..67af08fb --- /dev/null +++ b/app/client/src/assets/ic-brand-iot.svg @@ -0,0 +1,13 @@ + + + + + + + + + + + + + diff --git a/app/client/src/assets/ic-data-augmentation.svg b/app/client/src/assets/ic-data-augmentation.svg new file mode 100644 index 00000000..38e47ea7 --- /dev/null +++ b/app/client/src/assets/ic-data-augmentation.svg @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/app/client/src/components/JobStatus/jobStatusIcon.tsx b/app/client/src/components/JobStatus/jobStatusIcon.tsx index b34b9b26..c9511a33 100644 --- 
a/app/client/src/components/JobStatus/jobStatusIcon.tsx +++ b/app/client/src/components/JobStatus/jobStatusIcon.tsx @@ -1,6 +1,7 @@ import { Tooltip } from "antd"; import { CheckCircleTwoTone, ExclamationCircleTwoTone, InfoCircleTwoTone, LoadingOutlined } from '@ant-design/icons'; import { JobStatus } from "../../types"; +import styled from "styled-components"; export type JobStatusProps = { status: JobStatus @@ -17,25 +18,46 @@ const defaultTooltipTitles: Record = { 'null': 'No job was executed' } +const IconWrapper = styled.div` + svg { + font-size: 20px; + } +` + + export default function JobStatusIcon({ status, customTooltipTitles }: JobStatusProps) { const tooltipTitles = {...defaultTooltipTitles, ...customTooltipTitles}; function jobStatus() { switch (status) { case "ENGINE_SUCCEEDED": - return ; + return + + ; case 'ENGINE_STOPPED': - return ; + return + + ; case 'ENGINE_TIMEDOUT': - return ; + return + + ; case 'ENGINE_SCHEDULING': - return ; + return + + ; case 'ENGINE_RUNNING': - return ; + return + + ; case null: - return ; + return + + ; default: - return ; + return + + ; } } diff --git a/app/client/src/constants.ts b/app/client/src/constants.ts index 2849cc17..b5f201bc 100644 --- a/app/client/src/constants.ts +++ b/app/client/src/constants.ts @@ -8,6 +8,8 @@ export const LABELS = { [Pages.GENERATOR]: 'Generation', [Pages.EVALUATOR]: 'Evaluator', [Pages.DATASETS]: 'Datasets', + [Pages.EVALUATIONS]: 'Evaluations', + [Pages.EXPORTS]: 'Exports', [Pages.HISTORY]: 'History', [Pages.FEEDBACK]: 'Feedback', //[Pages.TELEMETRY]: 'Telemetry', @@ -19,7 +21,11 @@ export const LABELS = { export const TRANSLATIONS: Record = { "code_generation": "Code Generation", - "text2sql": "Text to SQL" + "text2sql": "Text to SQL", + "custom": "Custom", + "lending_data": "Lending Data", + "credit_card_data": "Credit Card Data", + "ticketing_dataset": "Ticketing Dataset" }; export const CDSW_PROJECT_URL = import.meta.env.VITE_CDSW_PROJECT_URL; diff --git 
a/app/client/src/pages/DataGenerator/Configure.tsx b/app/client/src/pages/DataGenerator/Configure.tsx index a68bec84..85f196bb 100644 --- a/app/client/src/pages/DataGenerator/Configure.tsx +++ b/app/client/src/pages/DataGenerator/Configure.tsx @@ -1,15 +1,18 @@ import endsWith from 'lodash/endsWith'; import isEmpty from 'lodash/isEmpty'; import isFunction from 'lodash/isFunction'; -import { useEffect, useState } from 'react'; -import { Flex, Form, Input, Select, Typography } from 'antd'; +import { FunctionComponent, useEffect, useState } from 'react'; +import { Flex, Form, FormInstance, Input, Select, Typography } from 'antd'; import styled from 'styled-components'; import { File, WorkflowType } from './types'; import { useFetchModels } from '../../api/api'; import { MODEL_PROVIDER_LABELS } from './constants'; import { ModelProviders, ModelProvidersDropdownOpts } from './types'; -import { useWizardCtx } from './utils'; +import { getWizardModel, getWizardModeType, useWizardCtx } from './utils'; import FileSelectorButton from './FileSelectorButton'; +import UseCaseSelector from './UseCaseSelector'; +import { useLocation, useParams } from 'react-router-dom'; +import { WizardModeType } from '../../types'; const StepContainer = styled(Flex)` @@ -36,7 +39,7 @@ export const USECASE_OPTIONS = [ ]; export const WORKFLOW_OPTIONS = [ - { label: 'Supervised Fine-Tuning', value: 'supervised-fine-tuning' }, + // { label: 'Supervised Fine-Tuning', value: 'supervised-fine-tuning' }, { label: 'Custom Data Generation', value: 'custom' }, { label: 'Freeform Data Generation', value: 'freeform' } ]; @@ -46,9 +49,31 @@ export const MODEL_TYPE_OPTIONS: ModelProvidersDropdownOpts = [ { label: MODEL_PROVIDER_LABELS[ModelProviders.CAII], value: ModelProviders.CAII }, ]; -const Configure = () => { +const Configure: FunctionComponent = () => { const form = Form.useFormInstance(); const formData = Form.useWatch((values) => values, form); + const location = useLocation(); + const { 
template_name, generate_file_name } = useParams(); + const [wizardModeType, setWizardModeType] = useState(getWizardModeType(location)); + + useEffect(() => { + if (wizardModeType === WizardModeType.DATA_AUGMENTATION) { + setWizardModeType(WizardModeType.DATA_AUGMENTATION); + form.setFieldValue('workflow_type', 'freeform'); + } else { + setWizardModeType(WizardModeType.DATA_GENERATION); + form.setFieldValue('workflow_type', 'custom'); + } + }, [location, wizardModeType]); + + useEffect(() => { + if (template_name) { + form.setFieldValue('use_case', template_name); + } + }, [template_name]); + + + // let formData = Form.useWatch((values) => values, form); const { setIsStepValid } = useWizardCtx(); const { data } = useFetchModels(); const [selectedFiles, setSelectedFiles] = useState( @@ -72,10 +97,13 @@ const Configure = () => { validateForm() }, [form, formData]) - // keivan + useEffect(() => { - if (formData && formData?.inference_type === undefined) { + if (formData && formData?.inference_type === undefined && isEmpty(generate_file_name)) { form.setFieldValue('inference_type', ModelProviders.CAII); + setTimeout(() => { + form.setFieldValue('use_case','custom'); + }, 1000); } }, [formData]); @@ -140,8 +168,10 @@ const Configure = () => { label='Model Provider' rules={[{ required: true }]} labelCol={labelCol} + shouldUpdate > - {USECASE_OPTIONS.map(option => - - {option.label} - - )} - - } + } {( - formData?.workflow_type === WorkflowType.SUPERVISED_FINE_TUNING || - formData?.workflow_type === WorkflowType.CUSTOM_DATA_GENERATION) && + formData?.workflow_type === WorkflowType.FREE_FORM_DATA_GENERATION || + formData?.use_case === 'custom') && + + prevValues.doc_paths !== currentValues.doc_paths || + prevValues.use_case !== currentValues.use_case + } + > + {({}) => { + const useCase = form.getFieldValue('use_case'); + if (useCase === 'custom') { + + } + return ( + ({ @@ -287,9 +316,9 @@ const Configure = () => { > @@ -328,6 +359,7 @@ const Configure = () => { 
name='output_value' label='Output Value' labelCol={labelCol} + tooltip='Enter the name for the generated values corresponding to each input. If left blank, this will default to “Completion”.' shouldUpdate > diff --git a/app/client/src/pages/DataGenerator/CustomPromptButton.tsx b/app/client/src/pages/DataGenerator/CustomPromptButton.tsx index d5781bef..7a725129 100644 --- a/app/client/src/pages/DataGenerator/CustomPromptButton.tsx +++ b/app/client/src/pages/DataGenerator/CustomPromptButton.tsx @@ -59,7 +59,6 @@ const CustomPromptButton: React.FC = ({ model_id, inference_type, caii_en const [showModal, setShowModal] = useState(false); const [disabled, setDisabled] = useState(false); const custom_prompt_instructions = Form.useWatch('custom_prompt_instructions', { form, preserve: true }); - console.log('custom_prompt_instructions', custom_prompt_instructions); const mutation = useMutation({ mutationFn: fetchCustomPrompt @@ -158,7 +157,7 @@ const CustomPromptButton: React.FC = ({ model_id, inference_type, caii_en disabled={mutation.isPending} rows={15} autoSize - placeholder={'Enter instructions for a custom prompt'} + placeholder={'Generate prompt from the example data'} /> diff --git a/app/client/src/pages/DataGenerator/DataGenerator.tsx b/app/client/src/pages/DataGenerator/DataGenerator.tsx index bbf9b71f..36f640d1 100644 --- a/app/client/src/pages/DataGenerator/DataGenerator.tsx +++ b/app/client/src/pages/DataGenerator/DataGenerator.tsx @@ -1,6 +1,6 @@ import isEmpty from 'lodash/isEmpty'; import isString from 'lodash/isString'; -import { useEffect, useRef, useState } from 'react'; +import { FunctionComponent, useEffect, useRef, useState } from 'react'; import { useLocation, useParams } from 'react-router-dom'; import { Button, Flex, Form, Layout, Steps } from 'antd'; @@ -20,10 +20,15 @@ import { DataGenWizardSteps, WizardStepConfig, WorkflowType } from './types'; import { WizardCtx } from './utils'; import { fetchDatasetDetails, useGetDatasetDetails } from 
'../DatasetDetails/hooks'; import { useMutation } from '@tanstack/react-query'; +import { WizardModeType } from '../../types'; const { Content } = Layout; // const { Title } = Typography; +interface Props { + mode?: WizardModeType; +} + const StyledTitle = styled.div` margin-top: 10px; font-family: Roboto, -apple-system, 'Segoe UI', sans-serif; @@ -62,48 +67,22 @@ const WizardFooter = styled(Flex)` `; -const steps: WizardStepConfig[] = [ - { - title: 'Configure', - key: DataGenWizardSteps.CONFIGURE, - content: , - required: true, - }, - { - title: 'Examples', - key: DataGenWizardSteps.EXAMPLES, - content: - }, - { - title: 'Prompt', - key: DataGenWizardSteps.PROMPT, - content: , - }, - { - title: 'Summary', - key: DataGenWizardSteps.SUMMARY, - content: - }, - { - title: 'Finish', - key: DataGenWizardSteps.FINISH, - content: - }, - -]; + /** * Wizard component for Synthetic Data Generation workflow */ -const DataGenerator = () => { +const DataGenerator: FunctionComponent = () => { const [current, setCurrent] = useState(0); const [maxStep, setMaxStep] = useState(0); const [isStepValid, setIsStepValid] = useState(false); + // Data passed from listing table to prepopulate form const location = useLocation(); const { generate_file_name } = useParams(); const initialData = location?.state?.data; + const mutation = useMutation({ mutationFn: fetchDatasetDetails }); @@ -113,14 +92,21 @@ const DataGenerator = () => { if (generate_file_name && !mutation.data) { mutation.mutate(generate_file_name); } + }, [generate_file_name]); + + useEffect(() => { if (mutation.data && mutation?.data?.dataset) { - form.setFieldsValue({ + const dataset = mutation?.data?.dataset as any; + const values = { ...initialData, - ...(mutation?.data?.dataset as any) - }); + ...dataset, + workflow_type: dataset.technique === 'freeform' ? 
+ WorkflowType.FREE_FORM_DATA_GENERATION : WorkflowType.CUSTOM_DATA_GENERATION + } + form.setFieldsValue(values); + formData.current = values; } - - }, [generate_file_name]); + }, [mutation.data]); if (initialData?.technique) { @@ -152,11 +138,43 @@ const DataGenerator = () => { initialData.doc_paths = []; } - const formData = useRef(initialData || { num_questions: 20, topics: [] }); const [form] = Form.useForm(); + + + + const steps: WizardStepConfig[] = [ + { + title: 'Configure', + key: DataGenWizardSteps.CONFIGURE, + content: , + required: true, + }, + { + title: 'Examples', + key: DataGenWizardSteps.EXAMPLES, + content: + }, + { + title: 'Prompt', + key: DataGenWizardSteps.PROMPT, + content: , + }, + { + title: 'Summary', + key: DataGenWizardSteps.SUMMARY, + content: + }, + { + title: 'Finish', + key: DataGenWizardSteps.FINISH, + content: + }, + + ]; + const onStepChange = (value: number) => { setCurrent(value); }; @@ -168,12 +186,12 @@ const DataGenerator = () => { } }; - const prev = () => setCurrent(Math.max(0, current - 1)) + const prev = () => setCurrent(Math.max(0, current - 1)); return ( - {'Synthetic Dataset Studio'} + {'Configure Synthetic Dataset'} { +const Examples: FunctionComponent = () => { const form = Form.useFormInstance(); - const [exampleType, setExampleType] = useState(ExampleType.PROMPT_COMPLETION); - + const { generate_file_name } = useParams(); + const [records, setRecords] = useState[]>([]); + const workflowType = form.getFieldValue('workflow_type'); + const mutation = useMutation({ mutationFn: fetchFileContent }); - const values = form.getFieldsValue(true) + + const restore_mutation = useMutation({ + mutationFn: fetchExamplesByUseCase + }); useEffect(() => { - const example_path = form.getFieldValue('example_path'); + if (isEmpty(generate_file_name)) { + const useCase = form.getFieldValue('use_case'); + restore_mutation.mutate(useCase); + } else { + setRecords(form.getFieldValue('examples')); + } + }, [form.getFieldValue('use_case'), 
generate_file_name]); + + useEffect(() => { + const example_path = form.getFieldValue('example_path'); if (!isEmpty(example_path)) { mutation.mutate({ path: example_path }); } - - if (form.getFieldValue('workflow_type') === 'freeform') { - setExampleType(ExampleType.FREE_FORM); - } - - - }, [form.getFieldValue('example_path'), form.getFieldValue('workflow_type')]); - useEffect(() => { + useEffect(() => { if (!isEmpty(mutation.data)) { form.setFieldValue('examples', mutation.data); + if (!isEqual(mutation.data, records)) { + setRecords(mutation.data); + } + } }, [mutation.data]); - const columns = [ - { - title: 'Prompts', - dataIndex: 'question', - ellipsis: true, - render: (_text: QuestionSolution, record: QuestionSolution) => <>{record.question} - }, - { - title: 'Completions', - dataIndex: 'solution', - ellipsis: true, - render: (_text: QuestionSolution, record: QuestionSolution) => <>{record.solution} - }, - { - title: 'Actions', - key: 'actions', - width: 130, - render: (_text: QuestionSolution, record: QuestionSolution, index: number) => { - const { question, solution } = record; - const editRow = (data: QuestionSolution) => { - const updatedExamples = [...form.getFieldValue('examples')]; - updatedExamples.splice(index, 1, data); - form.setFieldValue('examples', updatedExamples); - Modal.destroyAll() - } - const deleteRow = () => { - const updatedExamples = [...form.getFieldValue('examples')]; - updatedExamples.splice(index, 1); - form.setFieldValue('examples', updatedExamples); - } - return ( - - - - - ), - maskClosable: true, - width: 1000 - }) - }} - /> - - ) - }}, - ]; - const dataSource = Form.useWatch('examples', form); - const { data: examples, loading: examplesLoading } = useFetchExamples(form.getFieldValue('use_case')); - if (!dataSource && examples) { - form.setFieldValue('examples', examples.examples) + useEffect(() => { + if (!isEmpty(restore_mutation.data)) { + const examples = get(restore_mutation.data, 'examples', []); + 
form.setFieldValue('examples', examples); + setRecords(examples); + } + }, [restore_mutation.data]); + + useEffect(() => { + if (generate_file_name) { + setRecords(form.getFieldValue('examples')); + } + }, [generate_file_name]); + + const onRestoreDefaults = async() => { + const useCase = form.getFieldValue('use_case'); + restore_mutation.mutate(useCase); } - const rowLimitReached = form.getFieldValue('examples')?.length === MAX_EXAMPLES; - const workflowType = form.getFieldValue('workflow_type'); const onAddFiles = (files: File[]) => { if (!isEmpty (files)) { @@ -199,7 +124,6 @@ const Examples: React.FC = () => { ...values, example_path: get(file, '_path') }); - setExampleType(ExampleType.FREE_FORM); } } @@ -207,8 +131,16 @@ const Examples: React.FC = () => { span: 10 }; + const showEmptyState = (workflowType === WorkflowType.FREE_FORM_DATA_GENERATION && + isEmpty(mutation.data) && + records.length === 0) || + (form.getFieldValue('use_case') === 'custom' && + isEmpty(form.getFieldValue('examples'))); + + return ( + {mutation?.isPending || restore_mutation.isPending && }
@@ -217,7 +149,7 @@ const Examples: React.FC = () => { - {workflowType === WorkflowType.FREE_FORM_DATA_GENERATION && + {(workflowType === WorkflowType.FREE_FORM_DATA_GENERATION || workflowType === WorkflowType.CUSTOM_DATA_GENERATION) && <> { } - - {exampleType !== ExampleType.FREE_FORM && } - - {exampleType !== ExampleType.FREE_FORM && - - - } +
- {exampleType === ExampleType.FREE_FORM && !isEmpty(mutation.data) && - } - {exampleType === ExampleType.FREE_FORM && isEmpty(mutation.data) && !isEmpty(values.examples) && - } - {exampleType === ExampleType.FREE_FORM && isEmpty(mutation.data) && isEmpty(values.examples) && + {!isEmpty(records) && } + {showEmptyState && ( - - - } - imageStyle={{ - height: 60, - marginBottom: 24 - }} - description={ - <> -

- Upload a JSON file containing examples -

-

- {'Examples should be in the format of a JSON array containing array of key & value pairs. The key should be the column name and the value should be the cell value.'} -

- - } - > -
- } - {exampleType !== ExampleType.FREE_FORM && - - ({ - onClick: () => Modal.info({ - title: 'View Details', - content: , - icon: undefined, - maskClosable: true, - width: 1000 - }) - })} - rowClassName={() => 'hover-pointer'} - rowKey={(_record, index) => `examples-table-${index}`} - /> - } - + image={ + + + + } + imageStyle={{ + height: 60, + marginBottom: 24 + }} + description={ + <> +

+ {`Upload a JSON file containing examples`} +

+

+ {'Examples should be in the format of a JSON array containing array of key & value pairs. The key should be the column name and the value should be the cell value.'} +

+ + } + /> + )}
) }; diff --git a/app/client/src/pages/DataGenerator/FileSelectorButton.tsx b/app/client/src/pages/DataGenerator/FileSelectorButton.tsx index b8e6f880..a75a301b 100644 --- a/app/client/src/pages/DataGenerator/FileSelectorButton.tsx +++ b/app/client/src/pages/DataGenerator/FileSelectorButton.tsx @@ -9,9 +9,10 @@ interface Props { onAddFiles: (files: File[]) => void; workflowType: WorkflowType; label?: string; + allowFileTypes?: string[]; } -const FileSelectorButton: React.FC = ({ onAddFiles, workflowType, label }) => { +const FileSelectorButton: React.FC = ({ onAddFiles, workflowType, label, allowFileTypes }) => { const [showModal, setShowModal] = useState(false); const [selectedFiles, setSelectedFiles] = useState([]) @@ -43,7 +44,7 @@ const FileSelectorButton: React.FC = ({ onAddFiles, workflowType, label } onOk={() => onFinish()} width="60%" > - + )} diff --git a/app/client/src/pages/DataGenerator/FilesTable.tsx b/app/client/src/pages/DataGenerator/FilesTable.tsx index 5636655b..0fadbf7a 100644 --- a/app/client/src/pages/DataGenerator/FilesTable.tsx +++ b/app/client/src/pages/DataGenerator/FilesTable.tsx @@ -3,6 +3,7 @@ import filter from 'lodash/filter'; import clone from 'lodash/clone'; import set from 'lodash/set'; import forEach from 'lodash/forEach'; +import isString from 'lodash/isString'; import React, { useEffect, useState } from 'react'; import { Badge, Breadcrumb, Button, Col, Flex, List, Popover, Row, Table, Tooltip, Typography } from 'antd'; import styled from 'styled-components'; @@ -19,6 +20,7 @@ interface Props { workflowType: WorkflowType; files: File[]; onSelectedRows: (selectedRows: File[]) => void; + allowFileTypes?: string[]; } const StyledTable = styled(Table)` @@ -78,8 +80,14 @@ export const getSelectedRows = (fileSelectionMap: FileSelectionMap) => { return rows; } +export function getFileExtension(filename: string): string | null { + const lastDot = filename.lastIndexOf('.'); + if (lastDot === -1 || lastDot === 0) return null; + return 
filename.slice(lastDot + 1).toLowerCase(); +} + -const FilesTable: React.FC = ({ onSelectedRows, workflowType }) => { +const FilesTable: React.FC = ({ onSelectedRows, workflowType, allowFileTypes }) => { const [paths, setPaths] = useState(null); const [path, setPath] = useState(null); const [selectedRowKeys, setSelectedRowKeys] = useState([]); @@ -106,7 +114,12 @@ const FilesTable: React.FC = ({ onSelectedRows, workflowType }) => { if (isDirectory(record)) { return true; } - + if (Array.isArray(allowFileTypes) && !isEmpty(allowFileTypes)) { + const extension = getFileExtension(record.name); + if (isString(extension)) { + return !allowFileTypes.includes(extension as string); + } + } if (workflowType === WorkflowType.SUPERVISED_FINE_TUNING) { return !endsWith(record.name, '.pdf'); } else if (workflowType === WorkflowType.CUSTOM_DATA_GENERATION) { diff --git a/app/client/src/pages/DataGenerator/ImportExamples.tsx b/app/client/src/pages/DataGenerator/ImportExamples.tsx new file mode 100644 index 00000000..f3645597 --- /dev/null +++ b/app/client/src/pages/DataGenerator/ImportExamples.tsx @@ -0,0 +1,12 @@ +import React from 'react'; + +const ImportExamples: React.FC = () => { + return ( +
+

Import Examples

+

This is a placeholder for importing examples. Functionality will be added soon.

+
+ ); +}; + +export default ImportExamples; \ No newline at end of file diff --git a/app/client/src/pages/DataGenerator/Prompt.tsx b/app/client/src/pages/DataGenerator/Prompt.tsx index 45f8b798..8fa9c98b 100644 --- a/app/client/src/pages/DataGenerator/Prompt.tsx +++ b/app/client/src/pages/DataGenerator/Prompt.tsx @@ -226,7 +226,6 @@ const Prompt = () => { } }; } - console.log('mutation data:', mutation); return ( @@ -294,7 +293,7 @@ const Prompt = () => { - {((workflow_type === WorkflowType.CUSTOM_DATA_GENERATION && !isEmpty(doc_paths)) || + {((workflow_type === WorkflowType.FREE_FORM_DATA_GENERATION && !isEmpty(doc_paths)) || (workflow_type === WorkflowType.SUPERVISED_FINE_TUNING && !isEmpty(doc_paths))) && ; +} + + +const UseCaseSelector: FunctionComponent = ({ form }) => { + const [useCases, setUseCases] = useState([]); + const useCasesReq = useGetUseCases(); + + useEffect(() => { + if (useCasesReq.data) { + let _useCases = get(useCasesReq, 'data.usecases', []); + _useCases = _useCases.map((useCase: any) => ({ + ...useCase, + label: useCase.name, + value: useCase.id + })); + setUseCases(_useCases); + } + }, [useCasesReq.data]); + + const onChange = (value: string) => { + form.setFieldValue('use_case', value); + if (value !== 'custom') { + form.setFieldValue('example_path', null); + form.setFieldValue('examples', []); + } + } + + + return ( + + + + ); +} + +export default UseCaseSelector; \ No newline at end of file diff --git a/app/client/src/pages/DataGenerator/hooks.ts b/app/client/src/pages/DataGenerator/hooks.ts index 2cb1dc7e..2980205b 100644 --- a/app/client/src/pages/DataGenerator/hooks.ts +++ b/app/client/src/pages/DataGenerator/hooks.ts @@ -4,7 +4,8 @@ import toNumber from 'lodash/toNumber'; import isEmpty from 'lodash/isEmpty'; import isString from 'lodash/isString'; import { useMutation, useQuery } from '@tanstack/react-query'; -import { WorkflowType } from './types'; +import { ExampleType, WorkflowType } from './types'; +import { first } from 
'lodash'; const BASE_API_URL = import.meta.env.VITE_AMP_URL; @@ -244,4 +245,84 @@ export const useDatasetSize = ( isError, error }; - } \ No newline at end of file + } + + export const fetchUseCases = async () => { + const resp = await fetch(`${BASE_API_URL}/use-cases`, { + method: 'GET' + }); + const body = await resp.json(); + return body; +} + +export const useGetUseCases = () => { + const { data, isLoading, isError, error, isFetching } = useQuery( + { + queryKey: ['useCases'], + queryFn: () => fetchUseCases(), + refetchOnWindowFocus: false, + } + ); + return { + data, + isLoading: isLoading || isFetching, + isError, + error + }; +} + +export const fetchExamplesByUseCase = async (use_case: string) => { + const resp = await fetch(`${BASE_API_URL}/${isEmpty(use_case) ? 'custom' : use_case}/gen_examples`, { + method: 'GET' + }); + const body = await resp.json(); + return body; +} + +export const useGetExamplesByUseCase = (use_case: string) => { + const { data, isLoading, isError, error, isFetching, refetch } = useQuery( + { + queryKey: ['fetchUseCaseTopics', fetchExamplesByUseCase], + queryFn: () => fetchExamplesByUseCase(use_case), + refetchOnWindowFocus: false, + } + ); + + if (isError) { + notification.error({ + message: 'Error', + description: `An error occurred while fetching the use case examples.\n ${error?.message}` + }); + } + + + let examples = []; + let exmpleFormat: ExampleType | null = null; + if (!isEmpty(data) && !isEmpty(data?.examples)) { + examples = get(data, 'examples', []); + exmpleFormat = getExampleType(examples); + } + + return { + data, + isLoading: isLoading || isFetching, + isError, + error, + examples, + exmpleFormat, + refetch + }; +} + +export const getExampleType = (data: object[]) => { + if (!isEmpty(data)) { + const row = first(data); + const keys = Object.keys(row as object); + if (keys.length === 2) { + return ExampleType.PROMPT_COMPLETION; + } + return ExampleType.FREE_FORM; + } + return null; +} + diff --git 
a/app/client/src/pages/DataGenerator/types.ts b/app/client/src/pages/DataGenerator/types.ts index cb029ed2..73c64b20 100644 --- a/app/client/src/pages/DataGenerator/types.ts +++ b/app/client/src/pages/DataGenerator/types.ts @@ -119,4 +119,9 @@ export enum TechniqueType { SFT = 'sft', CUSTOME_WORKFLOW = 'custom_workflow', FREE_FORM = 'freeform' +} + +export enum ExampleType { + FREE_FORM = 'freeform', + PROMPT_COMPLETION = 'promptcompletion' } \ No newline at end of file diff --git a/app/client/src/pages/DataGenerator/utils.ts b/app/client/src/pages/DataGenerator/utils.ts index 2e10b648..699ce95e 100644 --- a/app/client/src/pages/DataGenerator/utils.ts +++ b/app/client/src/pages/DataGenerator/utils.ts @@ -3,6 +3,7 @@ import { WizardCtxObj } from './types'; import moment from 'moment'; import toString from 'lodash/toString'; import { File } from './types'; +import { WizardModeType } from '../../types'; export const WizardCtx = createContext(null); export const useWizardCtx = (): WizardCtxObj => { @@ -105,3 +106,15 @@ export const getHttpStatusCodeVerb = (statusCode: number) => { return null; } }; + +export const getWizardModeType = (location: any) => { + const pathname = location?.pathname || ''; + switch (pathname) { + case '/data-augmentation': + return WizardModeType.DATA_AUGMENTATION; + case '/data-generator': + return WizardModeType.DATA_GENERATION; + default: + return null; + } +} diff --git a/app/client/src/pages/Datasets/DatasetActions.tsx b/app/client/src/pages/Datasets/DatasetActions.tsx new file mode 100644 index 00000000..f33b9f7f --- /dev/null +++ b/app/client/src/pages/Datasets/DatasetActions.tsx @@ -0,0 +1,148 @@ +import isEmpty from 'lodash/isEmpty'; +import React from 'react'; +import { Button, Dropdown, Flex, MenuProps, Modal, Space, Typography } from "antd"; +import { Dataset } from "../Evaluator/types"; +import styled from "styled-components"; +import { ExportOutlined, FolderViewOutlined, ThunderboltOutlined } from '@ant-design/icons'; +import 
FindInPageIcon from '@mui/icons-material/FindInPage'; +import QueryStatsIcon from '@mui/icons-material/QueryStats'; +import MoreVertIcon from '@mui/icons-material/MoreVert'; +import DeleteIcon from '@mui/icons-material/Delete'; +import { Link } from "react-router-dom"; +import { useDeleteDataset } from '../../api/Datasets/datasets'; +import { useState } from 'react'; +import DatasetDetailModal from '../../components/Datasets/DatasetDetails/DatasetDetailModal'; +import { DatasetResponse } from '../../api/Datasets/response'; +import { getFilesURL } from '../Evaluator/util'; +import { Pages } from "../../types"; + +const { Text } = Typography; + +const ButtonGroup = styled(Flex)` + margin-top: 15px !important; +` + +interface DatasetActionsProps { + dataset: Dataset; + refetch: () => void; + setToggleDatasetExportModal: (toggle: boolean) => void; +} + + +const DatasetActions: React.FC = ({ dataset, refetch, setToggleDatasetExportModal }) => { + const [showModal, setShowModal] = useState(false); + const deleteDatasetReq = useDeleteDataset(); + + const deleteConfirmWarningModal = (row: Dataset) => { + return Modal.warning({ + title: 'Remove Dataset', + closable: true, + content: ( + <> + + {`Are you sure you want to remove this dataset`} {row.display_name}? 
+ + + ), + icon: undefined, + footer: ( + + + + + ), + maskClosable: true, + width: "20%" + }) + } + + async function handleDeleteEvaluationConfirm() { + await deleteDatasetReq.triggerDelete(dataset?.generate_file_name, `file_path=${dataset?.local_export_path}`); + // await datasetHistoryAPI.triggerGet(); + refetch(); + } + + const menuActions: MenuProps['items'] = [ + { + key: '1', + label: + + View Dataset Details + , + icon: + }, + { + key: '2', + label: ( + + View in Preview + + ), + icon: , + }, + { + key: '3', + label: ( + + Generate Dataset + + ), + icon: , + }, + { + key: '4', + label: ( + + Evaluate Dataset + + ), + icon: , + }, + { + key: '5', + label: ( + + Export Dataset + + ), + onClick: () => setToggleDatasetExportModal(true), + icon: + }, + { + key: '6', + label: ( + deleteConfirmWarningModal(dataset)}>Remove Dataset + ), + icon: + } + ]; + + return ( + <> + + + + + + + + {showModal && } + + + ) +} + +export default DatasetActions; \ No newline at end of file diff --git a/app/client/src/pages/Datasets/DatasetsPage.tsx b/app/client/src/pages/Datasets/DatasetsPage.tsx new file mode 100644 index 00000000..b36c85d7 --- /dev/null +++ b/app/client/src/pages/Datasets/DatasetsPage.tsx @@ -0,0 +1,213 @@ +import throttle from 'lodash/throttle'; +import React, { SyntheticEvent, useEffect } from 'react'; +import { Col, Flex, Input, Layout, Row, Table, TableProps, Tooltip, notification } from 'antd'; +import styled from 'styled-components'; +import Paragraph from 'antd/es/typography/Paragraph'; +import { useDatasets } from '../Home/hooks'; +import { ExportResult } from '../../components/Export/ExportModal'; +import { SearchProps } from 'antd/es/input'; +import Loading from '../Evaluator/Loading'; +import { Dataset } from '../Evaluator/types'; +import { JOB_EXECUTION_TOTAL_COUNT_THRESHOLD, TRANSLATIONS } from '../../constants'; +import DatasetExportModal, { ExportResult } from '../../components/Export/ExportModal'; +import DateTime from 
'../../components/DateTime/DateTime'; +import DatasetActions from './DatasetActions'; +import { sortItemsByKey } from '../../utils/sortutils'; + +import { JobStatus } from '../../types'; +import JobStatusIcon from '../../components/JobStatus/jobStatusIcon'; +import StyledTitle from '../Evaluator/StyledTitle'; + +const { Content } = Layout; +const { Search } = Input; + +const StyledContent = styled(Content)` + padding: 24px; + background-color: #f5f7f8; +`; + +const Container = styled.div` + background-color: #ffffff; + padding: 1rem; + overflow-x: auto; +`; + +const StyledTable = styled(Table)` + font-family: Roboto, -apple-system, 'Segoe UI', sans-serif; + color: #5a656d; + .ant-table-thead > tr > th { + color: #5a656d; + border-bottom: 1px solid #eaebec; + font-weight: 500; + text-align: left; + // background: #ffffff; + border-bottom: 1px solid #eaebec; + transition: background 0.3s ease; + } + .ant-table-row > td.ant-table-cell { + padding: 8px; + padding-left: 16px; + font-size: 13px; + font-family: Roboto, -apple-system, 'Segoe UI', sans-serif; + color: #5a656d; + .ant-typography { + font-size: 13px; + font-family: Roboto, -apple-system, 'Segoe UI', sans-serif; + } + } +`; + +const StyledParagraph = styled(Paragraph)` + font-size: 13px; + font-family: Roboto, -apple-system, 'Segoe UI', sans-serif; + color: #5a656d; +`; + +const DatasetsPage: React.FC = () => { + const { data, isLoading, isError, refetch, setSearchQuery, pagination } = useDatasets(); + const [notificationInstance, notificationContextHolder] = notification.useNotification(); + const [exportResult, setExportResult] = React.useState(); + const [toggleDatasetExportModal, setToggleDatasetExportModal] = React.useState(false); + const [datasetDetails, setDatasetDetails] = React.useState({} as Dataset); + + useEffect(() => { + if (isError) { + notification.error({ + message: 'Error', + description: 'An error occurred while fetching datasets' + }); + } + }, [isError]); + + useEffect(() => { + if 
(exportResult?.successMessage) { + notificationInstance.success({ + message: `Dataset Exported to Huggingface`, + description: "Dataset has been successfully exported." + }); + } + if (exportResult?.failedMessage) { + notificationInstance.error({ + message: "Error Exporting Dataset", + description: "There was an error exporting the dataset. Please try again." + }); + } + }, [exportResult, notificationInstance]) + + const onSearch: SearchProps['onSearch'] = (value: unknown) => { + throttle((value: string) => setSearchQuery(value), 500)(value); + } + + const onChange = (event: SyntheticEvent) => { + const value = (event.target as HTMLInputElement)?.value; + throttle((value: string) => setSearchQuery(value), 500)(value); + } + + const columns: TableProps['columns'] = [ + { + key: 'job_status', + title: 'Status', + dataIndex: 'job_status', + width: 80, + sorter: sortItemsByKey('job_status'), + render: (status: JobStatus) => + + + }, + { + key: 'display_name', + title: 'Display Name', + dataIndex: 'display_name', + width: 140, + sorter: sortItemsByKey('display_name') + }, { + key: 'generate_file_name', + title: 'Dataset Name', + dataIndex: 'generate_file_name', + width: 250, + sorter: sortItemsByKey('generate_file_name'), + render: (generate_file_name) => {generate_file_name} + }, { + key: 'model_id', + title: 'Model', + dataIndex: 'model_id', + width: 250, + sorter: sortItemsByKey('model_id'), + render: (modelId) => {modelId} + }, { + key: 'num_questions', + title: 'Questions Per Topic', + dataIndex: 'num_questions', + width: 120, + align: 'center', + sorter: sortItemsByKey('num_questions') + }, + { + key: 'total_count', + title: 'Total Count', + dataIndex: 'total_count', + width: 80, + align: 'center', + sorter: sortItemsByKey('total_count') + }, { + key: 'use_case', + title: 'Use Case', + dataIndex: 'use_case', + width: 120, + sorter: sortItemsByKey('use_case'), + render: (useCase) => TRANSLATIONS[useCase] + }, { + key: 'timestamp', + title: 'Creation Time', + 
dataIndex: 'timestamp', + defaultSortOrder: 'descend', + width: 120, + sorter: sortItemsByKey('timestamp'), + render: (timestamp) => <>{timestamp == null ? 'N/A' : } + }, { + key: '7', + title: 'Actions', + width: 100, + render: (row: Dataset) => ( + + ) + }, + ]; + return ( + + + {'Datasets'} + + + + + + + {isLoading && } + `${row?.display_name}_${row?.generate_file_name}`} + tableLayout="fixed" + pagination={pagination} + columns={columns} + dataSource={data?.data || [] as Dataset[]} + onRow={(row: Dataset) => + ({ + onClick: () => { + setDatasetDetails(row); + } + })} + /> + + {notificationContextHolder} + + + + + ); +}; + +export default DatasetsPage; \ No newline at end of file diff --git a/app/client/src/pages/Datasets/hooks.ts b/app/client/src/pages/Datasets/hooks.ts new file mode 100644 index 00000000..e69de29b diff --git a/app/client/src/pages/Evaluations/EvaluateActions.tsx b/app/client/src/pages/Evaluations/EvaluateActions.tsx new file mode 100644 index 00000000..0aeb1bd4 --- /dev/null +++ b/app/client/src/pages/Evaluations/EvaluateActions.tsx @@ -0,0 +1,124 @@ +import { useState } from "react"; +import { Dropdown, Flex, Space, MenuProps, Typography, Modal, Button } from "antd"; +import MoreVertIcon from '@mui/icons-material/MoreVert'; +import FindInPageIcon from '@mui/icons-material/FindInPage'; +import DeleteIcon from '@mui/icons-material/Delete'; +import { FolderViewOutlined, ThunderboltOutlined } from '@ant-design/icons'; +import { Link } from "react-router-dom"; +import EvaluationDetailModal from "../../components/Evaluations/EvaluationDetails/EvaluationDetailModal"; +import { Evaluation } from "./types"; +import styled from "styled-components"; +import { useDeleteEvaluation } from "../../api/Evaluations/evaluations"; +import { Pages } from "../../types"; +import { getFilesURL } from "../Evaluator/util"; + +const { Text } = Typography; + +interface Props { + evaluation: Evaluation; + refetch: () => void; +} + +const ModalButtonGroup = styled(Flex)` 
+ margin-top: 15px !important; +`; + +const EvaluationActions: React.FC = ({ evaluation, refetch }) => { + const [showModal, setShowModal] = useState(false); + const deleteEvaluationReq = useDeleteEvaluation(); + + async function handleDeleteEvaluationConfirm() { + await deleteEvaluationReq.triggerDelete(evaluation.evaluate_file_name, `file_path=${evaluation.local_export_path}`); + refetch(); + } + + const deleteConfirmWarningModal = (row: Evaluation) => { + return Modal.warning({ + title: 'Remove Evaluation', + closable: true, + content: ( + <> + + {`Are you sure you want to remove this evaluation`} {row.display_name}? + + + ), + icon: undefined, + footer: ( + + + + + ), + maskClosable: true, + width: "20%" + }) + } + + const menuActions: MenuProps['items'] = [ + { + key: '1', + label: ( + + View Evaluation Details + + ), + icon: + }, + { + key: '2', + label: ( + + View in Preview + + ), + icon: , + }, + { + key: '3', + label: ( + + Re-evaluate + + ), + icon: , + }, + { + key: '4', + label: ( + deleteConfirmWarningModal(evaluation)}>Remove Evaluation + ), + icon: + } + ]; + + return ( + <> + + + + + + + + {showModal && + } + + ); +} + +export default EvaluationActions; \ No newline at end of file diff --git a/app/client/src/pages/Evaluations/EvaluationsPage.tsx b/app/client/src/pages/Evaluations/EvaluationsPage.tsx new file mode 100644 index 00000000..68db61cc --- /dev/null +++ b/app/client/src/pages/Evaluations/EvaluationsPage.tsx @@ -0,0 +1,170 @@ +import throttle from "lodash/throttle"; +import { SyntheticEvent, useEffect } from "react"; +import { Badge, Col, Flex, Input, Layout, notification, Row, Table, TableProps } from "antd"; +import styled from "styled-components"; +import Paragraph from 'antd/es/typography/Paragraph'; +import { JOB_EXECUTION_TOTAL_COUNT_THRESHOLD, TRANSLATIONS } from '../../constants'; +import { useEvaluations } from "../Home/hooks"; +import { Evaluation } from "../Home/types"; +import { sortItemsByKey } from "../../utils/sortutils"; 
+import Loading from "../Evaluator/Loading"; + +import { SearchProps } from "antd/es/input"; +import DateTime from "../../components/DateTime/DateTime"; +import EvaluateActions from "./EvaluateActions"; +import { getColorCode } from "../Evaluator/util"; +import { JobStatus } from "../../types"; +import JobStatusIcon from "../../components/JobStatus/jobStatusIcon"; +import StyledTitle from "../Evaluator/StyledTitle"; + + +const { Content } = Layout; +const { Search } = Input; + +const StyledContent = styled(Content)` + padding: 24px; + background-color: #f5f7f8; +`; + +const Container = styled.div` + background-color: #ffffff; + padding: 1rem; + overflow-x: auto; +`; + +const StyledTable = styled(Table)` + .ant-table-thead > tr > th { + color: #5a656d; + border-bottom: 1px solid #eaebec; + font-weight: 500; + text-align: left; + // background: #ffffff; + border-bottom: 1px solid #eaebec; + transition: background 0.3s ease; + } + .ant-table-row > td.ant-table-cell { + padding: 8px; + padding-left: 16px; + font-size: 14px; + font-family: Roboto, -apple-system, 'Segoe UI', sans-serif; + color: #5a656d; + .ant-typography { + font-size: 13px; + font-family: Roboto, -apple-system, 'Segoe UI', sans-serif; + } + } +`; + +const StyledParagraph = styled(Paragraph)` + font-size: 13px; + font-family: Roboto, -apple-system, 'Segoe UI', sans-serif; + color: #5a656d; +`; + + +const EvaluationsPage: React.FC = () => { + const { data, isLoading, isError, refetch, setSearchQuery, pagination } = useEvaluations(); + + useEffect(() => { + if (isError) { + notification.error({ + message: 'Error', + description: 'An error occurred while fetching evaluations' + }); + } + }, [isError]); + + const onSearch: SearchProps['onSearch'] = (value: unknown) => { + throttle((value: string) => setSearchQuery(value), 500)(value); + } + + const onChange = (event: SyntheticEvent) => { + const value = (event.target as HTMLInputElement)?.value; + throttle((value: string) => setSearchQuery(value), 
500)(value); + } + + const columns: TableProps['columns'] = [ + { + key: 'job_status', + title: 'Status', + dataIndex: 'job_status', + width: 80, + sorter: sortItemsByKey('job_status'), + render: (status: JobStatus) => + + + }, + { + key: 'display_name', + title: 'Display Name', + dataIndex: 'display_name', + width: 150, + sorter: sortItemsByKey('display_name'), + }, { + key: 'model_id', + title: 'Model ID', + dataIndex: 'model_id', + width: 150, + sorter: sortItemsByKey('model_id'), + }, { + key: 'average_score', + title: 'Average Score', + dataIndex: 'average_score', + width: 80, + render: (average_score) => , + sorter: sortItemsByKey('average_score'), + },{ + key: 'use_case', + title: 'Use Case', + dataIndex: 'use_case', + width: 180, + sorter: sortItemsByKey('use_case'), + render: (useCase) => {TRANSLATIONS[useCase]} + }, { + key: 'timestamp', + title: 'Create Time', + dataIndex: 'timestamp', + width: 140, + sorter: sortItemsByKey('timestamp'), + render: (timestamp) => + + }, { + key: 'action', + title: 'Actions', + width: 100, + render: (row: Evaluation) => + + + }, + ]; + + + return ( + + + {'Evaluations'} + + + + + + + {isLoading && } + `${row?.display_name}_${row?.evaluate_file_name}`} + tableLayout="fixed" + pagination={pagination} + columns={columns} + dataSource={data?.data || [] as Evaluation[]} + /> + + + + ); +} + +export default EvaluationsPage; \ No newline at end of file diff --git a/app/client/src/pages/Exports/ExportsPage.tsx b/app/client/src/pages/Exports/ExportsPage.tsx new file mode 100644 index 00000000..b82b9a5f --- /dev/null +++ b/app/client/src/pages/Exports/ExportsPage.tsx @@ -0,0 +1,36 @@ +import { useEffect } from "react"; +import { Layout } from "antd"; +import styled from "styled-components"; +import ExportsTab from "../Home/ExportsTab"; +import StyledTitle from "../Evaluator/StyledTitle"; + +const { Content } = Layout; + +const StyledContent = styled(Content)` + padding: 24px; + background-color: #f5f7f8; +`; + +const Container = 
styled.div` + background-color: #ffffff; + padding: 1rem; + overflow-x: auto; +`; + + + +const ExportsPage: React.FC = () => { + return ( + + + + {'Exports'} + + + + + + ); +} + +export default ExportsPage; \ No newline at end of file diff --git a/app/client/src/pages/Home/DatasetsTab.tsx b/app/client/src/pages/Home/DatasetsTab.tsx index 508db33f..7ce2d040 100644 --- a/app/client/src/pages/Home/DatasetsTab.tsx +++ b/app/client/src/pages/Home/DatasetsTab.tsx @@ -21,6 +21,9 @@ const { Search } = Input; const Container = styled.div` background-color: #ffffff; padding: 1rem; + padding-left: 0; + padding-right: 0; + overflow-x: auto; `; const StyledTable = styled(Table)` @@ -54,7 +57,11 @@ const StyledParagraph = styled(Paragraph)` color: #5a656d; `; -const DatasetsTab: React.FC = () => { +interface Props { + hideSearch?: boolean; +} + +const DatasetsTab: React.FC = ({ hideSearch = false }) => { const { data, isLoading, isError, refetch, setSearchQuery, pagination } = useDatasets(); const [notificationInstance, notificationContextHolder] = notification.useNotification(); const [exportResult, setExportResult] = React.useState(); @@ -109,40 +116,51 @@ const DatasetsTab: React.FC = () => { key: 'display_name', title: 'Display Name', dataIndex: 'display_name', - sorter: sortItemsByKey('display_name'), + width: 140, + sorter: sortItemsByKey('display_name') }, { key: 'generate_file_name', title: 'Dataset Name', dataIndex: 'generate_file_name', - sorter: sortItemsByKey('generate_file_name'), width: 250, + sorter: sortItemsByKey('generate_file_name'), render: (generate_file_name) => {generate_file_name} }, { key: 'model_id', title: 'Model', dataIndex: 'model_id', - sorter: sortItemsByKey('model_id'), width: 250, + sorter: sortItemsByKey('model_id'), render: (modelId) => {modelId} }, { key: 'num_questions', title: 'Questions Per Topic', dataIndex: 'num_questions', + width: 120, align: 'center', - sorter: sortItemsByKey('num_questions'), - width: 120 + sorter: 
sortItemsByKey('num_questions') }, { key: 'total_count', title: 'Total Count', dataIndex: 'total_count', + width: 80, align: 'center', - sorter: sortItemsByKey('total_count'), - width: 80 - }, { + sorter: sortItemsByKey('total_count') + }, + { + key: 'completed_rows', + title: 'Completed Rows', + dataIndex: 'completed_rows', + width: 80, + align: 'center', + sorter: sortItemsByKey('completed_rows') + }, + { key: 'use_case', title: 'Use Case', dataIndex: 'use_case', + width: 120, sorter: sortItemsByKey('use_case'), render: (useCase) => TRANSLATIONS[useCase] }, { @@ -150,12 +168,13 @@ const DatasetsTab: React.FC = () => { title: 'Creation Time', dataIndex: 'timestamp', defaultSortOrder: 'descend', + width: 120, sorter: sortItemsByKey('timestamp'), render: (timestamp) => <>{timestamp == null ? 'N/A' : } }, { key: '7', title: 'Actions', - width: 150, + width: 100, render: (row: Dataset) => ( ) @@ -164,7 +183,7 @@ const DatasetsTab: React.FC = () => { return ( - + {!hideSearch && { onChange={onChange} style={{ width: 350 }} /> - + } {isLoading && } `${row?.display_name}_${row?.generate_file_name}`} tableLayout="fixed" - pagination={pagination} + // pagination={pagination} columns={columns} dataSource={data?.data || [] as Dataset[]} onRow={(row: Dataset) => diff --git a/app/client/src/pages/Home/EvaluateButton.tsx b/app/client/src/pages/Home/EvaluateButton.tsx deleted file mode 100644 index 83225e82..00000000 --- a/app/client/src/pages/Home/EvaluateButton.tsx +++ /dev/null @@ -1,85 +0,0 @@ -import { Button, Form, Modal, Select } from "antd"; -import { useNavigate } from 'react-router-dom'; -import ArrowRightIcon from '../../assets/ic-arrow-right.svg'; -import { useEffect, useState } from "react"; -import { useDatasets } from "./hooks"; -import Loading from "../Evaluator/Loading"; -import { isEmpty } from "lodash"; -import { Dataset } from "../Evaluator/types"; -import { Pages } from "../../types"; - - -const EvaluateButton: React.FC = () => { - const [form] = 
Form.useForm(); - const navigate = useNavigate(); - const [showModal, setShowModal] = useState(false); - const [datasets, setDatasets] = useState([]); - const {data, isLoading} = useDatasets(); - - useEffect(() => { - if(!isEmpty(data?.data)) { - setDatasets(data?.data); - } - }, [data]); - - const initialValues = { - dataset_name: null - } - - const onClose = () => setShowModal(false); - - const onSubmit = async () => { - try { - await form.validateFields(); - const values = form.getFieldsValue(); - const dataset = datasets.find((dataset: Dataset) => dataset.display_name === values.dataset_name); - navigate(`/${Pages.EVALUATOR}/create/${dataset?.generate_file_name}`); - } catch (e) { - console.error(e); - } - } - - const options = datasets.map((dataset: unknown) => ({ - value: dataset.display_name, - label: dataset.display_name, - key: `${dataset?.display_name}-${dataset?.generate_file_name}` - })); - - return ( - <> - - {showModal && - - {isLoading && } -
- - - + +
+
+ )} + + ); +} + +export default EvaluateSection; \ No newline at end of file diff --git a/app/client/src/pages/Home/EvaluationsTab.tsx b/app/client/src/pages/Home/EvaluationsTab.tsx index 3ee69c8d..bcd3711a 100644 --- a/app/client/src/pages/Home/EvaluationsTab.tsx +++ b/app/client/src/pages/Home/EvaluationsTab.tsx @@ -21,6 +21,9 @@ const { Search } = Input; const Container = styled.div` background-color: #ffffff; padding: 1rem; + padding-left: 0; + padding-right: 0; + overflow-x: auto; `; const StyledTable = styled(Table)` @@ -52,8 +55,12 @@ const StyledParagraph = styled(Paragraph)` color: #5a656d; `; +interface Props { + hideSearch?: boolean; +} + -const EvaluationsTab: React.FC = () => { +const EvaluationsTab: React.FC = ({ hideSearch }) => { const { data, isLoading, isError, refetch, setSearchQuery, pagination } = useEvaluations(); useEffect(() => { @@ -89,34 +96,40 @@ const EvaluationsTab: React.FC = () => { key: 'display_name', title: 'Display Name', dataIndex: 'display_name', + width: 150, sorter: sortItemsByKey('display_name'), }, { key: 'model_id', title: 'Model ID', dataIndex: 'model_id', + width: 150, sorter: sortItemsByKey('model_id'), }, { key: 'average_score', title: 'Average Score', dataIndex: 'average_score', + width: 80, render: (average_score) => , sorter: sortItemsByKey('average_score'), },{ key: 'use_case', title: 'Use Case', dataIndex: 'use_case', + width: 180, sorter: sortItemsByKey('use_case'), render: (useCase) => {TRANSLATIONS[useCase]} }, { key: 'timestamp', title: 'Create Time', dataIndex: 'timestamp', + width: 140, sorter: sortItemsByKey('timestamp'), render: (timestamp) => }, { key: 'action', title: 'Actions', + width: 100, render: (row: Evaluation) => @@ -125,7 +138,7 @@ const EvaluationsTab: React.FC = () => { return ( - + {!hideSearch && { onChange={onChange} style={{ width: 350 }} /> - + } {isLoading && } `${row?.display_name}_${row?.evaluate_file_name}`} tableLayout="fixed" - pagination={pagination} + // pagination={pagination} 
columns={columns} dataSource={data?.data || [] as Evaluation[]} /> diff --git a/app/client/src/pages/Home/ExportsTab.tsx b/app/client/src/pages/Home/ExportsTab.tsx index a81f870f..f0c458c8 100644 --- a/app/client/src/pages/Home/ExportsTab.tsx +++ b/app/client/src/pages/Home/ExportsTab.tsx @@ -16,6 +16,8 @@ const { Text, Link, Paragraph } = Typography; const Container = styled.div` background-color: #ffffff; padding: 1rem; + padding-left: 0; + padding-right: 0; `; const StyledTable = styled(Table)` @@ -51,6 +53,8 @@ const StyledParagraph = styled(Paragraph)` export type ExportsTabProps = { refetchOnRender: boolean; + hideSearch?: boolean; + hidePagination?: boolean; }; const columns: TableProps['columns'] = [ @@ -108,7 +112,7 @@ const columns: TableProps['columns'] = [ } ]; -const ExportsTab: React.FC = ({ refetchOnRender }) => { +const ExportsTab: React.FC = ({ refetchOnRender, hideSearch, hidePagination }) => { const [pagination, setPagination] = useState({ page: 1, pageSize: 20 }); const { isLoading, data, refetch } = useGetExportJobs(pagination.page, pagination.pageSize); const [searchTerm, setSearchTerm] = useState(''); @@ -137,7 +141,7 @@ const ExportsTab: React.FC = ({ refetchOnRender }) => { return ( - + {!hideSearch && = ({ refetchOnRender }) => { }} style={{ width: 350 }} /> - + } row.id} columns={columns} tableLayout="fixed" - pagination={{ + pagination={hidePagination ? 
false : { current: pagination.page, pageSize: pagination.pageSize, total: data?.pagination?.total || 0, diff --git a/app/client/src/pages/Home/HomePage.tsx b/app/client/src/pages/Home/HomePage.tsx index 231ba64d..61385f1c 100644 --- a/app/client/src/pages/Home/HomePage.tsx +++ b/app/client/src/pages/Home/HomePage.tsx @@ -1,14 +1,16 @@ import React, { useState } from 'react'; import styled from 'styled-components'; -import { Button, Col, Flex, Layout, Row, Tabs } from 'antd' +import { Button, Card, Col, Flex, Layout, Row, Tabs } from 'antd' import type { TabsProps } from 'antd'; import DatasetsTab from './DatasetsTab'; import EvaluationsTab from './EvaluationsTab'; -import DatasetIcon from '../../assets/ic-datasets.svg'; -import ArrowRightIcon from '../../assets/ic-arrow-right.svg'; -import EvaluateIcon from '../../assets/ic-evaluations.svg'; -import EvaluateButton from './EvaluateButton'; +import DatasetIcon from '../../assets/ic-brand-alternative-data.svg'; +import DataAugmentationIcon from '../../assets/ic-data-augmentation.svg'; import ExportsTab from './ExportsTab'; +import TemplatesSection from './TemplatesSection'; +import { useNavigate } from 'react-router-dom'; +import EvaluateSection from './EvaluateSection'; +import ArrowRightIcon from '../../assets/ic-arrow-right.svg'; const { Content } = Layout; @@ -18,33 +20,46 @@ const StyledContent = styled(Content)` background-color: #f5f7f8; `; -const HeaderSection = styled.div` +export const HeaderSection = styled.div` display: flex; + flex-direction: column; margin-bottom: 1rem; - height: 100px; + height: 150px; width: 50%; padding: 16px; background-color: #ffffff; + cursor: pointer; + .top-section { + display: flex; + flex-direction: row; + } + .bottom-section { + display: flex; + flex-direction: row; + justify-content: flex-end; + margin-top: 8px; + } .left-section { width: 66px; - height: 66px; + height: 46px; flex-grow: 0; margin: 0 8px 9px 0; padding: 14.4px 14.4px 14.4px 14.4px; - background-color: #e5ffe5; 
+ background-color: #ffffff; } .middle-section { display: flex; flex-direction: column; justify-content: center; margin-left: 8px; + margin-top: 12px; width: 70%; .section-title { width: 186px; height: 24px; flex-grow: 0; font-size: 16px; - font-weight: 500; + font-weight: normal; font-stretch: normal; font-style: normal; line-height: 1.5; @@ -52,13 +67,26 @@ const HeaderSection = styled.div` text-align: left; color: #1b2329; } - } - .right-section { + .section-description { + align-self: stretch; + flex-grow: 1; + font-size: 12px; + font-weight: normal; + font-stretch: normal; + font-style: normal; + line-height: 1.33; + letter-spacing: normal; + text-align: left; + color: #1b2329; + min-height: 50px; + } + } + .right-section { display: flex; flex-direction: column-reverse; } .evaluate-icon { - background-color: #fff4cd; + background-color: #ffffff; } `; @@ -69,23 +97,24 @@ export enum ViewType { } const HomePage: React.FC = () => { + const navigate = useNavigate(); const [tabViewType, setTabViewType] = useState(ViewType.DATASETS); const items: TabsProps['items'] = [ { key: ViewType.DATASETS, label: 'Datasets', - children: , + children: , }, { key: ViewType.EVALUATIONS, label: 'Evaluations', - children: , + children: , }, { key: ViewType.EXPORTS, label: 'Exports', - children: , + children: , } ]; @@ -97,50 +126,69 @@ const HomePage: React.FC = () => { - + navigate('/data-generator')}> +
Datasets
-
Create Datasets
+
Generation
-

Generate synthetic datasets for training models

+ Create synthetic data from scratch using examples, documents, seed instructions and AI assisted prompts.
-
+
+ +
-
+
- -
- Datasets -
-
-
Evaluate
-
-

Evaluate generated datasets for fine tuning LLMs

+ + navigate('/data-augmentation')}> +
+
+ augmentation +
+
+
Augmentation
+
+ Add synthetic rows or field to existing data to fill gaps or balance datasets such as language translations. +
-
+ +
- +
+ - + + + + + + + + ); diff --git a/app/client/src/pages/Home/TemplateCard.tsx b/app/client/src/pages/Home/TemplateCard.tsx new file mode 100644 index 00000000..112a68e2 --- /dev/null +++ b/app/client/src/pages/Home/TemplateCard.tsx @@ -0,0 +1,213 @@ +import React from 'react'; +import styled from "styled-components"; +import { Template } from './types'; +import { Popover, Space, Tag } from 'antd'; +import ArrowRightIcon from '../../assets/ic-arrow-right-light.svg'; +import { Pages } from '../../types'; +import { useNavigate } from 'react-router-dom'; +import sample from 'lodash/sample'; +import { getTemplateTagColors, TemplateTagThemes } from './constans'; + + +interface Props { + template: Template; +} + +const StyledCard = styled.div` + background-color: #ffffff; + display: flex; + flex-direction: column; + overflow-x: auto; + height: 200px; + width: 300px; + align-self: stretch; + flex-grow: 0; + justify-content: flex-start; + align-items: stretch; + gap: 8px; + padding: 16px 24px; + border-radius: 4px; + border: 1px solid #d6d8db; + cursor: pointer; + +`; + +const TopSection = styled.div` + display: flex; + flex-direction: column; + justify-content: center; + align-items: flex-start; + flex: 1; + margin-bottom: 1rem; +`; + +const StyledTitle = styled.div` + height: 24px; + flex-grow: 0; + font-size: 16px; + font-weight: normal; + font-stretch: normal; + font-style: normal; + line-height: 1.5; + letter-spacing: normal; + text-align: left; + color: rgba(0, 0, 0, 0.85); +`; + +const StyledDescription = styled.div` + height: 44px; + align-self: stretch; + flex-grow: 0; + font-size: 14px; + font-weight: normal; + font-stretch: normal; + font-style: normal; + line-height: 1.57; + letter-spacing: normal; + text-align: left; + color: rgba(0, 0, 0, 0.45); + overflow: hidden; + text-overflow: ellipsis; + display: -webkit-box; + -webkit-line-clamp: 2; + -webkit-box-orient: vertical; + overflow: hidden; + display: -webkit-box; + -webkit-line-clamp: 2; /* Number of lines 
to show */ + -webkit-box-orient: vertical; + text-overflow: ellipsis; +`; + + + +const BottomSection = styled.div` + flex: 1; + display: flex; + align-items: center; + height: 32px; + flex-grow: 1; + display: flex; + flex-direction: row; + justify-content: space-between; + align-items: center; + padding: 0; + .text { + width: 78px; + height: 24px; + flex-grow: 0; + font-size: 14px; + font-weight: normal; + font-stretch: normal; + font-style: normal; + line-height: 1.57; + letter-spacing: normal; + text-align: left; + color: rgba(0, 0, 0, 0.88); + } + .icon { + width: 24px; + height: 24px; + flex-grow: 0; + display: flex; + justify-content: center; + align-items: center; + border-radius: 50%; + cursor: pointer; + color: #000000e1; + } +`; + +const TagsContainer = styled.div` + min-height: 30px; + display: block; + margin-bottom: 4px; + margin-top: 4px; + .ant-tag { + max-width: 150px; + } + .tag-title { + overflow: hidden; + white-space: nowrap; + text-overflow: ellipsis; + } +`; + +const StyledTag = styled(Tag)` + color: ${props => props.theme.color}; + background-color: ${props => props.theme.backgroundColor}; + border: 1px solid ${props => props.theme.borderColor}; +`; + + +const TemplateCard: React.FC = ({ template }) => { + const navigate = useNavigate(); + const hasTags = template.tag !== null && Array.isArray(template.tag); + const tags = !hasTags ? [] : template.tag.slice(0, 1); + const moreTags = !hasTags ? [] : template.tag.slice(1); + + const getTag = (tag: string) => { + const theme = sample(TemplateTagThemes); + const { color, backgroundColor, borderColor } = getTemplateTagColors(theme as string); + + return ( + +
+ {tag} +
+
+ ) + } + + + return ( + navigate(`/${Pages.GENERATOR}/${template.id}`)}> + + {template.name} + + {template.description} + + + + + {tags.map((tag: string) => ( + getTag(tag) + ))} + {moreTags.length > 0 && ( + + {moreTags.map((tag: string) => ( + getTag(tag) + ))} + + } + trigger="hover" + > + +
{`+${moreTags.length}`}
+
+ + )} + +
+ + Get Started +
+ Get Started +
+
+
+ ) +} + +export default TemplateCard; \ No newline at end of file diff --git a/app/client/src/pages/Home/TemplatesSection.tsx b/app/client/src/pages/Home/TemplatesSection.tsx new file mode 100644 index 00000000..a27b8cd4 --- /dev/null +++ b/app/client/src/pages/Home/TemplatesSection.tsx @@ -0,0 +1,56 @@ +import get from 'lodash/get'; +import { Card } from 'antd'; +import React from 'react'; +import styled from "styled-components"; +import { useGetUseCases } from '../DataGenerator/hooks'; +import Loading from '../Evaluator/Loading'; +import { Pages } from "../../types"; +import { Template } from './types'; +import TemplateCard from './TemplateCard'; + + +const Container = styled.div` + background-color: #ffffff; + padding: 1rem; + overflow-x: auto; +`; + +const StyledContainer = styled.div` + display: flex; + flex-wrap: wrap; + gap: 16px; + padding: 16px; + width: 100%; + // justify-content: center; + // align-items: center; +`; + + +const TemplatesSection: React.FC = () => { + const useCasesReq = useGetUseCases(); + if (useCasesReq.isLoading) { + return ; + } + + const useCases: Template[] = get(useCasesReq, 'data.usecases', []); + + + return ( + + {useCasesReq.isLoading && } + {useCasesReq.isError &&
Error loading templates
} + + {useCases.map((useCase: Template) => ( + ) + )} + + + +
+ ) +} + +export default TemplatesSection; \ No newline at end of file diff --git a/app/client/src/pages/Home/WelcomePage.tsx b/app/client/src/pages/Home/WelcomePage.tsx index 82498ab2..135f70dc 100644 --- a/app/client/src/pages/Home/WelcomePage.tsx +++ b/app/client/src/pages/Home/WelcomePage.tsx @@ -1,10 +1,12 @@ -import { Button, Col, Flex, Layout, Row, Image } from 'antd'; +import toString from 'lodash/toString'; +import { Button, Col, Flex, Layout, Row, Image, Checkbox } from 'antd'; import React from 'react'; import styled from 'styled-components'; import SDGIcon from '../../assets/sdg-landing.svg'; import LightBulbIcon from '../../assets/ic-lightbulb.svg'; import QueryPromptIcon from '../../assets/ic-query-prompt.svg'; import NumbersIcon from '../../assets/ic-numbers.svg'; +import { CheckboxChangeEvent } from 'antd/es/checkbox'; const { Content } = Layout; @@ -107,6 +109,11 @@ const InfoSection = styled.div` const WelcomePage: React.FC = () => { + const onChange = (e: CheckboxChangeEvent) => { + const checked = e.target.checked; + window.localStorage.setItem('sds_mute_welcome_page', toString(checked)); + } + return ( @@ -148,6 +155,10 @@ const WelcomePage: React.FC = () => {
+ +
+ {`Don't show me this again`} +
diff --git a/app/client/src/pages/Home/constans.ts b/app/client/src/pages/Home/constans.ts new file mode 100644 index 00000000..030d19c1 --- /dev/null +++ b/app/client/src/pages/Home/constans.ts @@ -0,0 +1,43 @@ + +export const TemplateTagThemes = [ 'green', 'blue', 'yellow' ]; + +export enum TemplateColors { + DARK_GREEN = '#0a5f0a', + LIGHT_GREEN = '#e5ffe5', + BORDER_GREEN = '#acfbac', + DARK_BLUE = '#004379', + LIGHT_BLUE = '#edf7ff', + BORDER_BLUE = '#90ceff', + LIGHT_YELLOW = '#fff4cd', + DARK_YELLOW = '#6e5600', + BORDER_YELLOW = '#fce079' +} + +export const getTemplateTagColors = (theme: string) => { + switch (theme) { + case 'green': + return { + color: TemplateColors.DARK_GREEN, + backgroundColor: TemplateColors.LIGHT_GREEN, + borderColor: TemplateColors.BORDER_GREEN + }; + case 'blue': + return { + color: TemplateColors.DARK_BLUE, + backgroundColor: TemplateColors.LIGHT_BLUE, + borderColor: TemplateColors.BORDER_BLUE + }; + case 'yellow': + return { + color: TemplateColors.DARK_YELLOW, + backgroundColor: TemplateColors.LIGHT_YELLOW, + borderColor: TemplateColors.BORDER_YELLOW + }; + default: + return { + color: TemplateColors.DARK_GREEN, + backgroundColor: TemplateColors.LIGHT_GREEN, + borderColor: TemplateColors.BORDER_GREEN + }; + } +}; \ No newline at end of file diff --git a/app/client/src/pages/Home/hooks.ts b/app/client/src/pages/Home/hooks.ts index 5b2b63d2..a0061dc9 100644 --- a/app/client/src/pages/Home/hooks.ts +++ b/app/client/src/pages/Home/hooks.ts @@ -35,7 +35,7 @@ const fetchExports = async (page = 1, pageSize = 10) => { export const useDatasets = () => { const [searchQuery, setSearchQuery] = useState(null); - const [pagination, setPagination] = useState({ page: 1, pageSize: 10 }); + const [pagination, setPagination] = useState({ page: 1, pageSize: 5 }); const { data, isLoading, isError, refetch } = useQuery>( { @@ -65,6 +65,9 @@ export const useDatasets = () => { data: filtered, }; } + if (filteredData && filteredData.data.length !== 0 && 
filteredData.data.length > 5) { + filteredData.data = filteredData.data.slice(0, 5); + } return { data: filteredData, @@ -116,6 +119,10 @@ export const useEvaluations = () => { data: filtered, }; } + + if (filteredData && filteredData.data.length !== 0 && filteredData.data.length > 5) { + filteredData.data = filteredData.data.slice(0, 5); + } return { data: filteredData, @@ -170,6 +177,10 @@ export const useExports = () => { }; } + + if (filteredData && filteredData.data.length !== 0 && filteredData.data.length > 5) { + filteredData.data = filteredData.data.slice(0, 5); + } return { data: filteredData, @@ -242,4 +253,4 @@ export const useUpgradeSynthesisStudio = () => { isError: mutation.isError, data: mutation.data }; -} \ No newline at end of file +} diff --git a/app/client/src/pages/Home/types.ts b/app/client/src/pages/Home/types.ts index d5c31b2b..62c5d18f 100644 --- a/app/client/src/pages/Home/types.ts +++ b/app/client/src/pages/Home/types.ts @@ -31,4 +31,11 @@ export interface DatasetDetails { export interface DatasetGeneration { [key: string]: string; +} + +export interface Template { + id: string; + name: string; + tag: string[]; + description: string; } \ No newline at end of file diff --git a/app/client/src/routes.tsx b/app/client/src/routes.tsx index 853257c5..9a0e4cc2 100644 --- a/app/client/src/routes.tsx +++ b/app/client/src/routes.tsx @@ -2,16 +2,23 @@ import { Navigate, createBrowserRouter } from "react-router-dom"; import Layout from "./Container"; import DataGenerator from "./pages/DataGenerator"; import HomePage from "./pages/Home"; -import { Pages } from "./types"; +import { Pages, WizardModeType } from "./types"; import EvaluatorPage from "./pages/Evaluator"; import ReevaluatorPage from "./pages/Evaluator/ReevaluatorPage"; import DatasetDetailsPage from "./pages/DatasetDetails/DatasetDetailsPage"; import WelcomePage from "./pages/Home/WelcomePage"; import ErrorPage from "./pages/ErrorPage"; import EvaluationDetailsPage from 
"./pages/EvaluationDetails/EvaluationDetailsPage"; +import DatasetsPage from "./pages/Datasets/DatasetsPage"; +import EvaluationsPage from "./pages/Evaluations/EvaluationsPage"; +import ExportsPage from "./pages/Exports/ExportsPage"; //import TelemetryDashboard from "./components/TelemetryDashboard"; +const isWelcomePageMuted = () => { + return window.localStorage.getItem('sds_mute_welcome_page') === 'true'; +} + const router = createBrowserRouter([ { path: '/', @@ -19,7 +26,9 @@ const router = createBrowserRouter([ children: [ { path: '/', // Redirect root to Pages.WELCOME - element: , + element: isWelcomePageMuted() ? : + , + errorElement: }, { path: Pages.HOME, @@ -29,10 +38,40 @@ const router = createBrowserRouter([ }, { path: Pages.GENERATOR, + element: , + errorElement: , + loader: async () => null + }, + { + path: `${Pages.GENERATOR}/:template_name`, element: , errorElement: , loader: async () => null }, + { + path: Pages.DATA_AUGMENTATION, + element: , + errorElement: , + loader: async () => null + }, + { + path: Pages.DATASETS, + element: , + errorElement: , + loader: async () => null + }, + { + path: Pages.EVALUATIONS, + element: , + errorElement: , + loader: async () => null + }, + { + path: Pages.EXPORTS, + element: , + errorElement: , + loader: async () => null + }, { path: `${Pages.REGENERATE}/:generate_file_name`, element: , diff --git a/app/client/src/types.ts b/app/client/src/types.ts index ee3381bf..900b81ab 100644 --- a/app/client/src/types.ts +++ b/app/client/src/types.ts @@ -1,10 +1,13 @@ export enum Pages { GENERATOR = 'data-generator', + DATA_AUGMENTATION = 'data-augmentation', REGENERATE = 're-generate', EVALUATOR = 'evaluator', HISTORY = 'history', HOME = 'home', DATASETS = 'datasets', + EVALUATIONS = 'evaluations', + EXPORTS = 'exports', WELCOME = 'welcome', FEEDBACK = 'feedback', UPGRADE = 'upgrade' @@ -45,4 +48,16 @@ export const EXPORT_TYPE_LABELS: Record = { export type JobStatus = 'ENGINE_STOPPED' | 'ENGINE_SUCCEEDED' | 
'ENGINE_TIMEDOUT' | 'ENGINE_SCHEDULING' | 'ENGINE_RUNNING' | 'null' | 'default'; -export const HuggingFaceIconUrl = "https://huggingface.co/front/assets/huggingface_logo-noborder.svg"; \ No newline at end of file +export const HuggingFaceIconUrl = "https://huggingface.co/front/assets/huggingface_logo-noborder.svg"; + +export interface UseCase { + name: string; + id: string; + label: string; + value: string; +} + +export enum WizardModeType { + DATA_GENERATION = 'data-generation', + DATA_AUGMENTATION = 'data-augmentation' +} \ No newline at end of file diff --git a/app/core/config.py b/app/core/config.py index 777a6e47..25025fe0 100644 --- a/app/core/config.py +++ b/app/core/config.py @@ -17,6 +17,7 @@ class UseCase(str, Enum): LENDING_DATA = "lending_data" #HOUSING_DATA = "housing_data" CREDIT_CARD_DATA = "credit_card_data" + TICKETING_DATASET = "ticketing_dataset" class Technique(str, Enum): SFT = "sft" @@ -87,7 +88,8 @@ class UseCaseMetadataEval(BaseModel): USE_CASE_CONFIGS = { UseCase.CODE_GENERATION: UseCaseMetadata( name="Code Generation", - description="Generate programming questions and solutions with code examples", + description="""Generates paired programming questions and fully-worked answers that include runnable, + well-formatted code plus concise explanations. Ideal for building Q-and-A datasets across programming languages like Python.""", topics=["Python Basics", "Data Manipulation", "Web Development", "Machine Learning", "Algorithms"], default_examples=[ { @@ -114,7 +116,9 @@ class UseCaseMetadataEval(BaseModel): UseCase.TEXT2SQL: UseCaseMetadata( name="Text to SQL", - description="Generate natural language to SQL query pairs", + description="""Creates natural-language questions matched to clean, + executable SQL queries (with optional brief clarifications) spanning basics, joins, aggregates, + subqueries, and window functions. 
Ensures each pair is consistently formatted for training or evaluation.""", topics=[ "Basic Queries", "Joins", @@ -147,7 +151,7 @@ class UseCaseMetadataEval(BaseModel): UseCase.CUSTOM: UseCaseMetadata( name="Custom", - description="Custom use case for user-defined data generation", + description="""A blank template meant for any user-defined synthetic data task.""", topics=[], default_examples=[], prompt = " ", @@ -156,7 +160,10 @@ class UseCaseMetadataEval(BaseModel): UseCase.LENDING_DATA: UseCaseMetadata( name="Lending Data", - description="Generate synthetic lending data", + description="""Produces realistic LendingClub-style loan records—complete borrower, + loan, and credit-profile fields—while respecting privacy + and intricate cross-field logic (grades, DTI, employment, etc.). + Useful for credit-risk modeling or analytics demos.""", topics=['Business loans', 'Personal loans', 'Auto loans', 'Home equity loans', "Asset-backed loans"], default_examples=[ { @@ -318,205 +325,63 @@ class UseCaseMetadataEval(BaseModel): UseCase.CREDIT_CARD_DATA: UseCaseMetadata( - name="Credit Card Data", - description="Synthetic data for credit card profile data", - topics=[ - "High income person", - "Low income person", - "Four-person family", - "Three-person family", - "Two-person family", - "Five-person family", - "more than 10 credit records", - "more than 20 credit records" - - ], - default_examples=[ - { - "ID": 100001, - "CODE_GENDER": "M", - "FLAG_OWN_CAR": "Y", - "FLAG_OWN_REALTY": "Y", - "CNT_CHILDREN": 2, - "AMT_INCOME_TOTAL": 85000, - "NAME_INCOME_TYPE": "Commercial associate", - "NAME_EDUCATION_TYPE": "Higher education", - "NAME_FAMILY_STATUS": "Married", - "NAME_HOUSING_TYPE": "House / apartment", - "DAYS_BIRTH": -12775, - "DAYS_EMPLOYED": -2890, - "FLAG_MOBIL": "Y", - "FLAG_WORK_PHONE": "Y", - "FLAG_PHONE": "Y", - "FLAG_EMAIL": "Y", - "OCCUPATION_TYPE": "Manager", - "CNT_FAM_MEMBERS": 4, - "CREDIT_RECORDS": [ - {"ID": 100001, "MONTHS_BALANCE": -24, "STATUS": "C"}, 
- {"ID": 100001, "MONTHS_BALANCE": -23, "STATUS": "0"}, - {"ID": 100001, "MONTHS_BALANCE": -22, "STATUS": "1"}, - {"ID": 100001, "MONTHS_BALANCE": -21, "STATUS": "C"}, - {"ID": 100001, "MONTHS_BALANCE": -20, "STATUS": "0"}, - {"ID": 100001, "MONTHS_BALANCE": -19, "STATUS": "C"}, - {"ID": 100001, "MONTHS_BALANCE": -18, "STATUS": "C"}, - {"ID": 100001, "MONTHS_BALANCE": -17, "STATUS": "0"}, - {"ID": 100001, "MONTHS_BALANCE": -16, "STATUS": "C"}, - {"ID": 100001, "MONTHS_BALANCE": -15, "STATUS": "C"}, - {"ID": 100001, "MONTHS_BALANCE": -14, "STATUS": "0"}, - {"ID": 100001, "MONTHS_BALANCE": -13, "STATUS": "1"}, - {"ID": 100001, "MONTHS_BALANCE": -12, "STATUS": "C"}, - {"ID": 100001, "MONTHS_BALANCE": -11, "STATUS": "C"}, - {"ID": 100001, "MONTHS_BALANCE": -10, "STATUS": "0"}, - {"ID": 100001, "MONTHS_BALANCE": -9, "STATUS": "C"}, - {"ID": 100001, "MONTHS_BALANCE": -8, "STATUS": "C"}, - {"ID": 100001, "MONTHS_BALANCE": -7, "STATUS": "0"}, - {"ID": 100001, "MONTHS_BALANCE": -6, "STATUS": "C"}, - {"ID": 100001, "MONTHS_BALANCE": -5, "STATUS": "C"}, - {"ID": 100001, "MONTHS_BALANCE": -4, "STATUS": "0"}, - {"ID": 100001, "MONTHS_BALANCE": -3, "STATUS": "0"}, - {"ID": 100001, "MONTHS_BALANCE": -2, "STATUS": "1"}, - {"ID": 100001, "MONTHS_BALANCE": -1, "STATUS": "C"}, - {"ID": 100001, "MONTHS_BALANCE": 0, "STATUS": "C"} - ] - }, - { - "ID": 100002, - "CODE_GENDER": "F", - "FLAG_OWN_CAR": "N", - "FLAG_OWN_REALTY": "N", - "CNT_CHILDREN": 0, - "AMT_INCOME_TOTAL": 42000, - "NAME_INCOME_TYPE": "Working", - "NAME_EDUCATION_TYPE": "Secondary / secondary special", - "NAME_FAMILY_STATUS": "Single / not married", - "NAME_HOUSING_TYPE": "Rented apartment", - "DAYS_BIRTH": -9850, - "DAYS_EMPLOYED": -1825, - "FLAG_MOBIL": "Y", - "FLAG_WORK_PHONE": "N", - "FLAG_PHONE": "Y", - "FLAG_EMAIL": "Y", - "OCCUPATION_TYPE": "Sales staff", - "CNT_FAM_MEMBERS": 1, - "CREDIT_RECORDS": [ - {"ID": 100002, "MONTHS_BALANCE": -18, "STATUS": "X"}, - {"ID": 100002, "MONTHS_BALANCE": -17, "STATUS": "X"}, - 
{"ID": 100002, "MONTHS_BALANCE": -16, "STATUS": "0"}, - {"ID": 100002, "MONTHS_BALANCE": -15, "STATUS": "1"}, - {"ID": 100002, "MONTHS_BALANCE": -14, "STATUS": "2"}, - {"ID": 100002, "MONTHS_BALANCE": -13, "STATUS": "3"}, - {"ID": 100002, "MONTHS_BALANCE": -12, "STATUS": "C"}, - {"ID": 100002, "MONTHS_BALANCE": -11, "STATUS": "0"}, - {"ID": 100002, "MONTHS_BALANCE": -10, "STATUS": "C"}, - {"ID": 100002, "MONTHS_BALANCE": -9, "STATUS": "0"}, - {"ID": 100002, "MONTHS_BALANCE": -8, "STATUS": "1"}, - {"ID": 100002, "MONTHS_BALANCE": -7, "STATUS": "C"}, - {"ID": 100002, "MONTHS_BALANCE": -6, "STATUS": "0"}, - {"ID": 100002, "MONTHS_BALANCE": -5, "STATUS": "C"}, - {"ID": 100002, "MONTHS_BALANCE": -4, "STATUS": "0"}, - {"ID": 100002, "MONTHS_BALANCE": -3, "STATUS": "0"}, - {"ID": 100002, "MONTHS_BALANCE": -2, "STATUS": "1"}, - {"ID": 100002, "MONTHS_BALANCE": -1, "STATUS": "2"}, - {"ID": 100002, "MONTHS_BALANCE": 0, "STATUS": "C"} - ] - }, - { - "ID": 100003, - "CODE_GENDER": "M", - "FLAG_OWN_CAR": "Y", - "FLAG_OWN_REALTY": "Y", - "CNT_CHILDREN": 1, - "AMT_INCOME_TOTAL": 95000, - "NAME_INCOME_TYPE": "State servant", - "NAME_EDUCATION_TYPE": "Higher education", - "NAME_FAMILY_STATUS": "Married", - "NAME_HOUSING_TYPE": "House / apartment", - "DAYS_BIRTH": -15330, - "DAYS_EMPLOYED": -4380, - "FLAG_MOBIL": "Y", - "FLAG_WORK_PHONE": "Y", - "FLAG_PHONE": "Y", - "FLAG_EMAIL": "Y", - "OCCUPATION_TYPE": "Core staff", - "CNT_FAM_MEMBERS": 3, - "CREDIT_RECORDS": [ - {"ID": 100003, "MONTHS_BALANCE": -36, "STATUS": "C"}, - {"ID": 100003, "MONTHS_BALANCE": -35, "STATUS": "C"}, - {"ID": 100003, "MONTHS_BALANCE": -34, "STATUS": "C"}, - {"ID": 100003, "MONTHS_BALANCE": -33, "STATUS": "C"}, - {"ID": 100003, "MONTHS_BALANCE": -32, "STATUS": "C"}, - {"ID": 100003, "MONTHS_BALANCE": -31, "STATUS": "C"}, - {"ID": 100003, "MONTHS_BALANCE": -30, "STATUS": "C"}, - {"ID": 100003, "MONTHS_BALANCE": -29, "STATUS": "C"}, - {"ID": 100003, "MONTHS_BALANCE": -28, "STATUS": "C"}, - {"ID": 100003, 
"MONTHS_BALANCE": -27, "STATUS": "C"}, - {"ID": 100003, "MONTHS_BALANCE": -26, "STATUS": "C"}, - {"ID": 100003, "MONTHS_BALANCE": -25, "STATUS": "C"}, - {"ID": 100003, "MONTHS_BALANCE": -24, "STATUS": "C"}, - {"ID": 100003, "MONTHS_BALANCE": -23, "STATUS": "C"}, - {"ID": 100003, "MONTHS_BALANCE": -22, "STATUS": "C"}, - {"ID": 100003, "MONTHS_BALANCE": -21, "STATUS": "C"}, - {"ID": 100003, "MONTHS_BALANCE": -20, "STATUS": "C"}, - {"ID": 100003, "MONTHS_BALANCE": -19, "STATUS": "C"}, - {"ID": 100003, "MONTHS_BALANCE": -18, "STATUS": "C"}, - {"ID": 100003, "MONTHS_BALANCE": -17, "STATUS": "C"}, - {"ID": 100003, "MONTHS_BALANCE": -16, "STATUS": "C"}, - {"ID": 100003, "MONTHS_BALANCE": -15, "STATUS": "C"}, - {"ID": 100003, "MONTHS_BALANCE": -14, "STATUS": "C"}, - {"ID": 100003, "MONTHS_BALANCE": -13, "STATUS": "C"}, - {"ID": 100003, "MONTHS_BALANCE": -12, "STATUS": "C"}, - {"ID": 100003, "MONTHS_BALANCE": -11, "STATUS": "C"}, - {"ID": 100003, "MONTHS_BALANCE": -10, "STATUS": "C"}, - {"ID": 100003, "MONTHS_BALANCE": -9, "STATUS": "C"}, - {"ID": 100003, "MONTHS_BALANCE": -8, "STATUS": "C"}, - {"ID": 100003, "MONTHS_BALANCE": -7, "STATUS": "C"}, - {"ID": 100003, "MONTHS_BALANCE": -6, "STATUS": "C"}, - {"ID": 100003, "MONTHS_BALANCE": -5, "STATUS": "C"}, - {"ID": 100003, "MONTHS_BALANCE": -4, "STATUS": "C"}, - {"ID": 100003, "MONTHS_BALANCE": -3, "STATUS": "C"}, - {"ID": 100003, "MONTHS_BALANCE": -2, "STATUS": "C"}, - {"ID": 100003, "MONTHS_BALANCE": -1, "STATUS": "C"}, - {"ID": 100003, "MONTHS_BALANCE": 0, "STATUS": "C"} - ] - }, - { - "ID": 100004, - "CODE_GENDER": "F", - "FLAG_OWN_CAR": "N", - "FLAG_OWN_REALTY": "N", - "CNT_CHILDREN": 3, - "AMT_INCOME_TOTAL": 28000, - "NAME_INCOME_TYPE": "Pensioner", - "NAME_EDUCATION_TYPE": "Secondary / secondary special", - "NAME_FAMILY_STATUS": "Widow/Widower", - "NAME_HOUSING_TYPE": "Rented apartment", - "DAYS_BIRTH": -23725, - "DAYS_EMPLOYED": 365, - "FLAG_MOBIL": "Y", - "FLAG_WORK_PHONE": "N", - "FLAG_PHONE": "N", - "FLAG_EMAIL": 
"N", - "OCCUPATION_TYPE": "Pensioner", - "CNT_FAM_MEMBERS": 4, - "CREDIT_RECORDS": [ - {"ID": 100004, "MONTHS_BALANCE": -12, "STATUS": "0"}, - {"ID": 100004, "MONTHS_BALANCE": -11, "STATUS": "1"}, - {"ID": 100004, "MONTHS_BALANCE": -10, "STATUS": "2"}, - {"ID": 100004, "MONTHS_BALANCE": -9, "STATUS": "3"}, - {"ID": 100004, "MONTHS_BALANCE": -8, "STATUS": "4"}, - {"ID": 100004, "MONTHS_BALANCE": -7, "STATUS": "5"}, - {"ID": 100004, "MONTHS_BALANCE": -6, "STATUS": "5"}, - {"ID": 100004, "MONTHS_BALANCE": -5, "STATUS": "5"}, - {"ID": 100004, "MONTHS_BALANCE": -4, "STATUS": "5"}, - {"ID": 100004, "MONTHS_BALANCE": -3, "STATUS": "5"}, - {"ID": 100004, "MONTHS_BALANCE": -2, "STATUS": "5"}, - {"ID": 100004, "MONTHS_BALANCE": -1, "STATUS": "5"}, - {"ID": 100004, "MONTHS_BALANCE": 0, "STATUS": "X"} - ] - } -], - prompt= """ - + name="Credit Card Data", + description="""Synthesizes comprehensive user profiles plus chronological credit-status histories, + maintaining ID consistency and plausibly evolving payment behavior. 
+ Designed for training credit-scoring models without exposing real customer information.""", + topics=[ + "High income person", + "Low income person", + "Four-person family", + "Three-person family", + "Two-person family", + "Five-person family", + "more than 10 credit records", + "more than 20 credit records" + ], + default_examples=[ + { + "ID": 100001, "CODE_GENDER": "M", "FLAG_OWN_CAR": "Y", "FLAG_OWN_REALTY": "Y", + "CNT_CHILDREN": 2, "AMT_INCOME_TOTAL": 85000, "NAME_INCOME_TYPE": "Commercial associate", + "NAME_EDUCATION_TYPE": "Higher education", "NAME_FAMILY_STATUS": "Married", + "NAME_HOUSING_TYPE": "House / apartment", "DAYS_BIRTH": -12775, "DAYS_EMPLOYED": -2890, + "FLAG_MOBIL": "Y", "FLAG_WORK_PHONE": "Y", "FLAG_PHONE": "Y", "FLAG_EMAIL": "Y", + "OCCUPATION_TYPE": "Manager", "CNT_FAM_MEMBERS": 4, "MONTHS_BALANCE": -24, "STATUS": "C" + }, + { + "ID": 100001, "CODE_GENDER": "M", "FLAG_OWN_CAR": "Y", "FLAG_OWN_REALTY": "Y", + "CNT_CHILDREN": 2, "AMT_INCOME_TOTAL": 85000, "NAME_INCOME_TYPE": "Commercial associate", + "NAME_EDUCATION_TYPE": "Higher education", "NAME_FAMILY_STATUS": "Married", + "NAME_HOUSING_TYPE": "House / apartment", "DAYS_BIRTH": -12775, "DAYS_EMPLOYED": -2890, + "FLAG_MOBIL": "Y", "FLAG_WORK_PHONE": "Y", "FLAG_PHONE": "Y", "FLAG_EMAIL": "Y", + "OCCUPATION_TYPE": "Manager", "CNT_FAM_MEMBERS": 4, "MONTHS_BALANCE": -23, "STATUS": "0" + }, + { + "ID": 100002, "CODE_GENDER": "F", "FLAG_OWN_CAR": "N", "FLAG_OWN_REALTY": "N", + "CNT_CHILDREN": 0, "AMT_INCOME_TOTAL": 42000, "NAME_INCOME_TYPE": "Working", + "NAME_EDUCATION_TYPE": "Secondary / secondary special", "NAME_FAMILY_STATUS": "Single / not married", + "NAME_HOUSING_TYPE": "Rented apartment", "DAYS_BIRTH": -9850, "DAYS_EMPLOYED": -1825, + "FLAG_MOBIL": "Y", "FLAG_WORK_PHONE": "N", "FLAG_PHONE": "Y", "FLAG_EMAIL": "Y", + "OCCUPATION_TYPE": "Sales staff", "CNT_FAM_MEMBERS": 1, "MONTHS_BALANCE": -18, "STATUS": "X" + }, + { + "ID": 100002, "CODE_GENDER": "F", "FLAG_OWN_CAR": "N", 
"FLAG_OWN_REALTY": "N", + "CNT_CHILDREN": 0, "AMT_INCOME_TOTAL": 42000, "NAME_INCOME_TYPE": "Working", + "NAME_EDUCATION_TYPE": "Secondary / secondary special", "NAME_FAMILY_STATUS": "Single / not married", + "NAME_HOUSING_TYPE": "Rented apartment", "DAYS_BIRTH": -9850, "DAYS_EMPLOYED": -1825, + "FLAG_MOBIL": "Y", "FLAG_WORK_PHONE": "N", "FLAG_PHONE": "Y", "FLAG_EMAIL": "Y", + "OCCUPATION_TYPE": "Sales staff", "CNT_FAM_MEMBERS": 1, "MONTHS_BALANCE": -17, "STATUS": "X" + }, + { + "ID": 100002, "CODE_GENDER": "F", "FLAG_OWN_CAR": "N", "FLAG_OWN_REALTY": "N", + "CNT_CHILDREN": 0, "AMT_INCOME_TOTAL": 42000, "NAME_INCOME_TYPE": "Working", + "NAME_EDUCATION_TYPE": "Secondary / secondary special", "NAME_FAMILY_STATUS": "Single / not married", + "NAME_HOUSING_TYPE": "Rented apartment", "DAYS_BIRTH": -9850, "DAYS_EMPLOYED": -1825, + "FLAG_MOBIL": "Y", "FLAG_WORK_PHONE": "N", "FLAG_PHONE": "Y", "FLAG_EMAIL": "Y", + "OCCUPATION_TYPE": "Sales staff", "CNT_FAM_MEMBERS": 1, "MONTHS_BALANCE": -16, "STATUS": "0" + } + ], + prompt=""" Generate synthetic data for a credit card dataset. Here is the context about the dataset: Credit score cards are a common risk control method in the financial industry. It uses personal information and data submitted by credit card applicants to predict the probability of future defaults and credit card borrowings. The bank is able to decide whether to issue a credit card to the applicant. Credit scores can objectively quantify the magnitude of risk. @@ -524,80 +389,103 @@ class UseCaseMetadataEval(BaseModel): At present, with the development of machine learning algorithms. More predictive methods such as Boosting, Random Forest, and Support Vector Machines have been introduced into credit card scoring. However, these methods often do not have good transparency. It may be difficult to provide customers and regulators with a reason for rejection or acceptance. - The dataset consists of two tables: `User Records` and `Credit Records`, merged by `ID`. 
The output must create field values with the following specifications: + The dataset is a single flat table where each row represents one month of a user's credit history. User information is repeated for each monthly record. The output must create field values with the following specifications: - User Records Fields (static per user): - - ID: Unique client number (e.g., 100001, 100002). + Fields: + - ID: Unique client number (e.g., 100001, 100002). This ID will be the same across multiple rows for the same user. - CODE_GENDER: Gender ('F' or 'M'). - FLAG_OWN_CAR: Car ownership ('Y' or 'N'). - FLAG_OWN_REALTY: Property ownership ('Y' or 'N'). - - CNT_CHILDREN`: Number of children (0 or more). - - AMT_INCOME_TOTAL`: Annual income. - - NAME_INCOME_TYPE`: Income category (e.g., 'Commercial associate', 'State servant'). - - NAME_EDUCATION_TYPE`: Education level (e.g., 'Higher education', 'Secondary'). - - NAME_FAMILY_STATUS`: Marital status (e.g., 'Married', 'Single'). - - NAME_HOUSING_TYPE`: Way of living. - - DAYS_BIRTH`: Birthday Count backwards from current day (0), -1 means yesterday. - - DAYS_EMPLOYED: Start date of employment Count backwards from current day(0). If positive, it means the person currently unemployed. (negative for employed; positive for unemployed). - - FLAG_MOBIL: Is there a mobile phone ('Y'/'N') - - FLAG_WORK_PHONE: Is there a work phone ('Y'/'N') - - FLAG_PHONE: Is there a phone ('Y'/'N') - - FLAG_EMAIL: Is there an email ('Y'/'N') + - CNT_CHILDREN: Number of children (0 or more). + - AMT_INCOME_TOTAL: Annual income. + - NAME_INCOME_TYPE: Income category (e.g., 'Commercial associate', 'State servant'). + - NAME_EDUCATION_TYPE: Education level (e.g., 'Higher education', 'Secondary'). + - NAME_FAMILY_STATUS: Marital status (e.g., 'Married', 'Single'). + - NAME_HOUSING_TYPE: Way of living. + - DAYS_BIRTH: Birthday, counted backwards from the current day (0), where -1 means yesterday. 
+ - DAYS_EMPLOYED: Start date of employment, counted backwards from the current day(0). A positive value indicates the person is currently unemployed. + - FLAG_MOBIL: Is there a mobile phone ('Y'/'N'). + - FLAG_WORK_PHONE: Is there a work phone ('Y'/'N'). + - FLAG_PHONE: Is there a phone ('Y'/'N'). + - FLAG_EMAIL: Is there an email ('Y'/'N'). - OCCUPATION_TYPE: Occupation (e.g., 'Manager', 'Sales staff'). - CNT_FAM_MEMBERS: Family size (1 or more). - - Credit records Fields (nested array): - - ID: needs to be the same as the User Records Fields ID. - - MONTHS_BALANCE: Refers to Record month. The month of the extracted data is the starting point, backwards, 0 is the current month, -1 is the previous month, and so on. - - STATUS: + - MONTHS_BALANCE: The month of the record, relative to the present. 0 is the current month, -1 is the previous month, and so on. + - STATUS: Must be one of ['0', '1', '2', '3', '4', '5', 'C', 'X']. - Values description: 0: 1-29 days past due 1: 30-59 days past due 2: 60-89 days overdue 3: 90-119 days overdue 4: 120-149 days overdue 5: Overdue or bad debts, write-offs for more than 150 days C: paid off that month X: No loan for the month - + Values description: 0: 1-29 days past due, 1: 30-59 days past due, 2: 60-89 days overdue, 3: 90-119 days overdue, 4: 120-149 days overdue, 5: Overdue or bad debts (150+ days), C: paid off that month, X: No loan for the month. + 3. Requirements: - - Consistency: Ensure `ID` consistency between the application and its nested credit records. - Avoid real personal data (use synthetic values). - - Format output as three separate JSON objects, each with the structure shown in the examples. + - Format output as three separate JSON objects, each with the flat structure shown in the examples. When generating the data, make sure to adhere to the following guidelines: Privacy guidelines: - Avoid real PII. 
- - Ensure examples are not leaked into the synthetic data + - Ensure examples are not leaked into the synthetic data. - Cross-row entries guidelines (applies to Credit Records): - - Entries must be ordered from oldest (`MONTHS_BALANCE=-60`) to newest (`MONTHS_BALANCE=0`). - - No duplicate `MONTHS_BALANCE` values for a single client. - - The time-series credit record entries need to be logical and consistent when read in the correct sequence. - - Ensure there are no other cross-row Credit Records inconsistencies not listed above. + Data Series Guidelines (for records with the same ID): + - Rows for the same ID must be ordered chronologically by MONTHS_BALANCE (e.g., -60 to 0). + - No duplicate MONTHS_BALANCE values for a single ID. + - The time-series credit record entries should be logical. For instance, delinquencies can progress (e.g., STATUS '0' -> '1' -> '2') as months advance (e.g., MONTHS_BALANCE -2 -> -1 -> 0). + - If the most recent MONTHS_BALANCE is 0, the STATUS should logically be 'X' (no loan) or 'C' (paid off). Formatting guidelines: - - `CNT_CHILDREN`, `AMT_INCOME_TOTAL`, `DAYS_BIRTH`, `DAYS_EMPLOYED`, etc., must be integers. - - `MONTHS_BALANCE` must be an integer 0 or less. - - Ensure no other formatting problems or inconsistencies appear that are not listed above. - - Cross-row entries guidelines (applies to Credit Records): - - Entries must be ordered from oldest (`MONTHS_BALANCE=-60`) to newest (`MONTHS_BALANCE=0`). - - No duplicate `MONTHS_BALANCE` values for a single client. - - If a Recent `MONTHS_BALANCE` is 0 there should be an "X" (no loan) or "C" (paid off). - - The time-series credit record entries need to be logical and consistent when read in the correct sequence. (e.g. delinquencies can appear in progression as "0" → "1" → "2" as months progress from "-2" → "-1" → "0" etc). - - Ensure there are no other Credit Records inconsistencies appear that not listed above. 
- - - Cross-Column guidelines: - - Check cross-column inconsistencies such as: - If `FLAG_OWN_REALTY="Y"`, `NAME_HOUSING_TYPE` must **not** be "Rented apartment". - If `DAYS_EMPLOYED > 0` (unemployed), `AMT_INCOME_TOTAL` should be lower (e.g., ≤ $50,000). - `OCCUPATION_TYPE` must align with `NAME_INCOME_TYPE` (e.g., "Pensioner" cannot have "Manager" as occupation). - `CNT_FAM_MEMBERS` ≥ `CNT_CHILDREN` + 1 (accounting for at least one parent). - - Ensure there are no other cross-field Credit Records inconsistencies appear that are not listed above. + - CNT_CHILDREN, AMT_INCOME_TOTAL, DAYS_BIRTH, DAYS_EMPLOYED, etc., must be integers. + - MONTHS_BALANCE must be an integer 0 or less. + - Ensure no other formatting problems or inconsistencies appear. + + Cross-Column guidelines: + - Ensure cross-column consistency. For example: + If FLAG_OWN_REALTY is "Y", NAME_HOUSING_TYPE must not be "Rented apartment". + If DAYS_EMPLOYED > 0 (unemployed), AMT_INCOME_TOTAL should be lower (e.g., <= $50,000). + OCCUPATION_TYPE must align with NAME_INCOME_TYPE (e.g., a "Pensioner" should not have "Manager" as an occupation). + CNT_FAM_MEMBERS must be greater than or equal to CNT_CHILDREN + 1 (to account for at least one parent). + - Ensure no other cross-field inconsistencies appear. + """, + schema=None +), + UseCase.TICKETING_DATASET: UseCaseMetadata( + name="Ticketing Dataset", + description= """Generates polite, professional customer-support prompts and assigns a single classification intent (cancel_ticket, customer_service, or report_payment_issue). + Perfect for intent-classification or help-desk automation training.""", + topics=["Technical Issues", "Billing Queries", "Payment queries"], + default_examples=[ + { + "Prompt": "I have received this message that I owe $300 and I was instructed to pay the bill online. 
I already paid this amount and I am wondering why I received this message.", + "Completion": "report_payment_issue" + }, + { + "Prompt": "I will not be able to attend the presentation and would like to cancel my rsvp.", + "Completion": "cancel_ticket" + }, + { + "Prompt": "I am having questions regarding the exact time, location, and requirements of the event and would like to talk to customer service.", + "Completion": "customer_service" + } + ] + , + prompt= """ + Generate authentic customer support ticket interactions that have a user query and system response. + For each user query, the system generates a keyword that is used to forward the user to the specific subsystem. + Requirements for user queries: + - Use professional, respectful language + - Follow standard customer service best practices + Each response should be a single id from the following list: + cancel_ticket,customer_service,report_payment_issue + Here are the explanations of the responses: + cancel_ticket means that the customer wants to cancel the ticket. + customer_service means that customer wants to talk to customer service. + report_payment_issue means that the customer is facing payment issues and wants to be forwarded to the billing department to resolve the issue. - """, + """, + schema=None + ) + } - schema=None - ), -} USE_CASE_CONFIGS_EVALS = { @@ -818,104 +706,129 @@ class UseCaseMetadataEval(BaseModel): ), UseCase.CREDIT_CARD_DATA: UseCaseMetadataEval( - name="Crdit Card Data", - + name="Credit Card Data", + + default_examples=[ + { + "score": 10, + "justification": """- No privacy violations were detected (no PII leakage). +- All fields adhere to the required formatting (e.g., integers where necessary, valid `MONTHS_BALANCE`). +- The data series for each ID is ordered correctly by `MONTHS_BALANCE`, contains no duplicates, and the `STATUS` values progress logically over time. +- Cross-column consistency is maintained: + - `FLAG_OWN_REALTY` aligns with `NAME_HOUSING_TYPE`. 
+ - Unemployed individuals (`DAYS_EMPLOYED` > 0) have appropriately lower incomes. + - `OCCUPATION_TYPE` matches `NAME_INCOME_TYPE`. + - `CNT_FAM_MEMBERS` is consistent with `CNT_CHILDREN`. +- No other critical errors were found. +""" + } + ], + prompt=""" + Evaluate the quality of the provided synthetic credit data and return a score between 1 and 10. The score should reflect how well the data adheres to the following criteria. + + Here is the context about the dataset: + + Credit score cards are a common risk control method in the financial industry. It uses personal information and data submitted by credit card applicants to predict the probability of future defaults and credit card borrowings. The bank is able to decide whether to issue a credit card to the applicant. Credit scores can objectively quantify the magnitude of risk. + Generally speaking, credit score cards are based on historical data. Once encountering large economic fluctuations, past models may lose their original predictive power. With the development of machine learning, more predictive methods like Boosting and Random Forests have been introduced, but they often lack transparency. + + The dataset is a **single flat table** where each row represents one month of a user's credit history. User information is repeated for each monthly record. + + **Fields:** + - **ID**: Unique client number (e.g., 100001). This ID is the same across multiple rows for the same user. + - **CODE_GENDER**: Gender ('F' or 'M'). + - **FLAG_OWN_CAR**: Car ownership ('Y' or 'N'). + - **FLAG_OWN_REALTY**: Property ownership ('Y' or 'N'). + - **CNT_CHILDREN**: Number of children (0 or more). + - **AMT_INCOME_TOTAL**: Annual income. + - **NAME_INCOME_TYPE**: Income category (e.g., 'Commercial associate', 'State servant'). + - **NAME_EDUCATION_TYPE**: Education level (e.g., 'Higher education', 'Secondary'). + - **NAME_FAMILY_STATUS**: Marital status (e.g., 'Married', 'Single'). + - **NAME_HOUSING_TYPE**: Way of living. 
+ - **DAYS_BIRTH**: Birthday, counted backwards from the current day (0). + - **DAYS_EMPLOYED**: Start date of employment, counted backwards from the current day(0). A positive value means the person is currently unemployed. + - **FLAG_MOBIL**: Has a mobile phone ('Y'/'N'). + - **FLAG_WORK_PHONE**: Has a work phone ('Y'/'N'). + - **FLAG_PHONE**: Has a phone ('Y'/'N'). + - **FLAG_EMAIL**: Has an email ('Y'/'N'). + - **OCCUPATION_TYPE**: Occupation (e.g., 'Manager', 'Sales staff'). + - **CNT_FAM_MEMBERS**: Family size (1 or more). + - **MONTHS_BALANCE**: The record month, relative to the present (0 is current, -1 is previous, etc.). + - **STATUS**: Must be one of ['0', '1', '2', '3', '4', '5', 'C', 'X']. + - **Values**: 0: 1-29 days past due, 1: 30-59, 2: 60-89, 3: 90-119, 4: 120-149, 5: 150+ days overdue/bad debt, C: Paid off, X: No loan. + + Evaluate whether the data adheres to the following guidelines: + + **Privacy Guidelines:** + - Ensure fictitious entries do not leak real PII. + + **Formatting Guidelines:** + - `CNT_CHILDREN`, `AMT_INCOME_TOTAL`, `DAYS_BIRTH`, `DAYS_EMPLOYED` must be integers. + - `MONTHS_BALANCE` must be an integer ≤ 0. + - Ensure no other formatting inconsistencies exist. + + **Data Series Guidelines (for rows with the same ID):** + - Rows for a single ID must be ordered chronologically by `MONTHS_BALANCE` (e.g., from -60 to 0). + - No duplicate `MONTHS_BALANCE` values for a single ID. + - The time-series data for an ID must be logical (e.g., records shouldn't start with a high delinquency number like '2'; they should start from '0', 'C', or 'X'). + - Consecutive 'C' statuses are allowed. + - Ensure no other inconsistencies exist within a user's data series. + + **Cross-Column Guidelines:** + - Check for inconsistencies such as: + - If `FLAG_OWN_REALTY` is "Y", `NAME_HOUSING_TYPE` must **not** be "Rented apartment". + - If `DAYS_EMPLOYED` > 0 (unemployed), `AMT_INCOME_TOTAL` should be low (e.g., ≤ $50,000). 
+ - `OCCUPATION_TYPE` must align with `NAME_INCOME_TYPE` (e.g., "Pensioner" occupation for "Pensioner" income type). + - `CNT_FAM_MEMBERS` ≥ `CNT_CHILDREN` + 1 (to account for at least one parent). + - Variables like `DAYS_BIRTH`, `DAYS_EMPLOYED`, and `OCCUPATION_TYPE` must be reasonable when considered together. + - Ensure no other cross-field inconsistencies exist. + + **Scoring Workflow:** + Start at 10 and deduct points for violations: + - **Subtract 2 points** for any Privacy Guideline violations. + - **Subtract 1 point** for any Formatting Guideline violations. + - **Subtract 4 points** for any Data Series Guideline violations. + - **Subtract 1 point** for any Cross-Column Guideline violations. + - **Subtract 2 points** for any other problems not listed above. + The minimum score is 1. If any critical errors are present (e.g., missing `ID`, PII leakage), the score should be capped at 1. + + Provide a score from 1-10 for the data and list your justifications. + """ +), + UseCase.TICKETING_DATASET: UseCaseMetadataEval( + name="Ticketing Dataset", default_examples=[ { - "score": 10, - "justification": """- No privacy violations detected (no PII leakage). - - All fields adhere to formatting requirements (integers where required, valid `MONTHS_BALANCE`, etc.). - - Cross-row entries are ordered correctly, no duplicates, and statuses progress logically (e.g., "0" → "1" → "2"). - - Cross-column consistency: - - `FLAG_OWN_REALTY="Y"` aligns with `NAME_HOUSING_TYPE`. - - Unemployed (`DAYS_EMPLOYED > 0`) have lower incomes. - - `OCCUPATION_TYPE` matches `NAME_INCOME_TYPE`. - - `CNT_FAM_MEMBERS` ≥ `CNT_CHILDREN` + 1. - - No other critical errors. + "score": 5, + "justification": """ + The query is professionally written, respectful, and follows customer service best practices. + The response 'report_payment_issue' is one of the allowed keywords. + The matching between the query and response is perfect according to the provided definitions. 
+ + """}, + { + "score": 3, + "justification": """ + The query is professionally written and respectful. + The response 'cancel_ticket' is one of the allowed keywords. + While the response uses a valid keyword, it doesn't match the most appropriate category for the specific query content. """ + }, - } ], - prompt = """ - Evaluate the quality of the provided synthetic credit data and return a score between 1 and 10. The score should reflect how well the data adheres to the following criteria: - - Here is the context about the dataset: - - Credit score cards are a common risk control method in the financial industry. It uses personal information and data submitted by credit card applicants to predict the probability of future defaults and credit card borrowings. The bank is able to decide whether to issue a credit card to the applicant. Credit scores can objectively quantify the magnitude of risk. - Generally speaking, credit score cards are based on historical data. Once encountering large economic fluctuations. Past models may lose their original predictive power. Logistic model is a common method for credit scoring. Because Logistic is suitable for binary classification tasks and can calculate the coefficients of each feature. In order to facilitate understanding and operation, the score card will multiply the logistic regression coefficient by a certain value (such as 100) and round it. - At present, with the development of machine learning algorithms. More predictive methods such as Boosting, Random Forest, and Support Vector Machines have been introduced into credit card scoring. However, these methods often do not have good transparency. It may be difficult to provide customers and regulators with a reason for rejection or acceptance. - - - The dataset consists of two tables: `User Records` and `Credit Records`, merged by `ID`. 
The output must create field values with the following specifications: - - User Records Fields (static per user): - - ID: Unique client number (e.g., 100001, 100002). - - CODE_GENDER: Gender ('F' or 'M'). - - FLAG_OWN_CAR: Car ownership ('Y' or 'N'). - - FLAG_OWN_REALTY: Property ownership ('Y' or 'N'). - - CNT_CHILDREN`: Number of children (0 or more). - - AMT_INCOME_TOTAL`: Annual income. - - NAME_INCOME_TYPE`: Income category (e.g., 'Commercial associate', 'State servant'). - - NAME_EDUCATION_TYPE`: Education level (e.g., 'Higher education', 'Secondary'). - - NAME_FAMILY_STATUS`: Marital status (e.g., 'Married', 'Single'). - - NAME_HOUSING_TYPE`: Way of living. - - DAYS_BIRTH`: Birthday Count backwards from current day (0), -1 means yesterday. - - DAYS_EMPLOYED: Start date of employment Count backwards from current day(0). If positive, it means the person currently unemployed. (negative for employed; positive for unemployed). - - FLAG_MOBIL: Is there a mobile phone ('Y'/'N') - - FLAG_WORK_PHONE: Is there a work phone ('Y'/'N') - - FLAG_PHONE: Is there a phone ('Y'/'N') - - FLAG_EMAIL: Is there an email ('Y'/'N') - - OCCUPATION_TYPE: Occupation (e.g., 'Manager', 'Sales staff'). - - CNT_FAM_MEMBERS: Family size (1 or more). - - Credit Records Fields (nested array): - - ID: needs to be the same as the User Records Fields ID. - - MONTHS_BALANCE: Refers to Record month. The month of the extracted data is the starting point, backwards, 0 is the current month, -1 is the previous month, and so on. - - STATUS: - Must be one of ['0', '1', '2', '3', '4', '5', 'C', 'X']. - Values description: 0: 1-29 days past due 1: 30-59 days past due 2: 60-89 days overdue 3: 90-119 days overdue 4: 120-149 days overdue 5: Overdue or bad debts, write-offs for more than 150 days C: paid off that month X: No loan for the month - - - Evaluate whether the data adhere to the following guidelines: - - Privacy guidelines: - - Allow ficticious PII entries that do not leak PII. 
+ prompt= """ + You are given a user query for a ticketing support system and the system responses which is a keyword that is used to forward the user to the specific subsystem. + Evaluate whether the queries: + - Use professional, respectful language + - Follow standard customer service best practices + Evaluate whether the responses use only one of the following keywords: cancel_ticket,customer_service,report_payment_issue + Evaluate whether the solutions and responses are correctly matched based on the following definitions: + cancel_ticket means that the customer wants to cancel the ticket. + customer_service means that customer wants to talk to customer service. + report_payment_issue means that the customer is facing payment issues and wants to be forwarded to the billing department to resolve the issue. + Give a score of 1-5 based on the following instructions: + If the responses don’t match the three keywords, always give value 1. + Rate the quality of the queries and responses based on the instructions and give a rating between 1 and 5. - Formatting guidelines: - - `CNT_CHILDREN`, `AMT_INCOME_TOTAL`, `DAYS_BIRTH`, `DAYS_EMPLOYED`, etc., must be integers. - - `MONTHS_BALANCE` must be an integer 0 or less. - - Ensure no other formatting problems or inconsistencies appear that are not listed above. - - Cross-row entries guidelines (applies to Credit Records): - - Entries must be ordered from oldest (e.g. `MONTHS_BALANCE=-60`) to newest (`MONTHS_BALANCE=0`). - - No duplicate `MONTHS_BALANCE` values for a single client. - - Consecutive STATUS=C is allowed since it indicates that each monthly payment and amount owned is paid off. - - The time-series credit record entries need to be logical and consistent when read in the correct sequence as months progress from negative to 0. - - Ensure the records dont start from deliquency 2 but rather from 0, C or X. - - Ensure there are no other Credit Records inconsistencies appear that not listed above. 
- - - Cross-Column guidelines: - - Check cross-column inconsistencies such as: - If `FLAG_OWN_REALTY="Y"`, `NAME_HOUSING_TYPE` must **not** be "Rented apartment". - If `DAYS_EMPLOYED > 0` (unemployed), `AMT_INCOME_TOTAL` should be lower (e.g., ≤ $50,000). - `OCCUPATION_TYPE` must align with `NAME_INCOME_TYPE` (e.g., "Pensioner" cannot have "Manager" as occupation). - `CNT_FAM_MEMBERS` ≥ `CNT_CHILDREN` + 1 (accounting for at least one parent). - DAYS_BIRTH, DAYS_EMPLOYED, OCCUPATION_TYPE and other variables are reasonable when considered together. - - Ensure there are no other cross-field Credit Records inconsistencies appear that are not listed above. - - - Scoring Workflow: - Start at 10, deduct points for violations: - Subtract 2 points for any Privacy guidelines violations. - Subtract 1 point for any formatting guidelines violations. - Subtract 1 point for any cross-column violations. - Subtract 4 points for any Cross-row guidelines guidelines violations. - Subtract 2 points for any other problem with the generated data not listed above. - Cap minimum score score at 1 if any critical errors (e.g., missing `ID`, PII, or invalid `STATUS`). - - - Give a score rating 1-10 for the given data. If there are more than 9 points to subtract use 1 as the absolute minimum scoring. List all justification as list. 
""" ) } diff --git a/app/core/database.py b/app/core/database.py index f5c7682c..03d5947c 100644 --- a/app/core/database.py +++ b/app/core/database.py @@ -71,8 +71,9 @@ def init_db(self): job_id TEXT, job_name TEXT UNIQUE, job_status TEXT, - job_creator_name TEXT - + job_creator_name TEXT, + completed_rows INTEGER + ) """) @@ -163,8 +164,8 @@ def save_generation_metadata(self, metadata: Dict) -> int: custom_prompt, model_parameters, input_key, output_key, output_value, generate_file_name, display_name, local_export_path, hf_export_path, s3_export_path, num_questions, total_count, topics, examples, - schema, doc_paths, input_path, job_id, job_name, job_status, job_creator_name - ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + schema, doc_paths, input_path, job_id, job_name, job_status, job_creator_name, completed_rows + ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) """ values = ( @@ -194,7 +195,8 @@ def save_generation_metadata(self, metadata: Dict) -> int: metadata.get('job_id', None), metadata.get('job_name', None), metadata.get('job_status', None), - metadata.get('job_creator_name', None) + metadata.get('job_creator_name', None), + metadata.get('completed_rows', None) ) cursor.execute(query, values) @@ -212,7 +214,7 @@ def save_generation_metadata(self, metadata: Dict) -> int: print(f"Error saving metadata to database: {str(e)}") raise - def update_job_generate(self, job_name: str, generate_file_name: str, local_export_path: str, timestamp: str, job_status): + def update_job_generate(self, job_name: str, generate_file_name: str, local_export_path: str, timestamp: str, job_status, completed_rows): """Update job generate with retry mechanism""" max_retries = 3 retry_delay = 1 # seconds @@ -244,11 +246,12 @@ def update_job_generate(self, job_name: str, generate_file_name: str, local_expo SET generate_file_name = ?, local_export_path = ?, timestamp = ?, - job_status = ? 
+ job_status = ?, + completed_rows = ? WHERE job_name = ? AND job_name IS NOT NULL AND job_name != '' - """, (generate_file_name, local_export_path, timestamp, job_status, job_name)) + """, (generate_file_name, local_export_path, timestamp, job_status, completed_rows,job_name)) rows_affected = cursor.rowcount conn.commit() @@ -818,6 +821,39 @@ def get_all_generate_metadata(self) -> List[Dict]: print(f"Error retrieving all metadata: {str(e)}") return [] + def get_paginated_generate_metadata_light(self, page: int, page_size: int) -> Tuple[int, List[Dict]]: + """Retrieve paginated metadata with only fields needed for list view""" + try: + with self.get_connection() as conn: + conn.row_factory = sqlite3.Row + cursor = conn.cursor() + + # Get total count + count_query = "SELECT COUNT(*) FROM generation_metadata" + cursor.execute(count_query) + total_count = cursor.fetchone()[0] + + # Get only fields needed for list view + offset = (page - 1) * page_size + query = """ + SELECT + id, timestamp, display_name, generate_file_name, model_id, + num_questions, total_count, use_case, job_status, + local_export_path, hf_export_path, completed_rows + FROM generation_metadata + ORDER BY timestamp DESC + LIMIT ? OFFSET ? 
+ """ + cursor.execute(query, (page_size, offset)) + + results = [dict(row) for row in cursor.fetchall()] + return total_count, results + + except Exception as e: + print(f"Error retrieving paginated metadata: {str(e)}") + return 0, [] + + def get_paginated_generate_metadata(self, page: int, page_size: int) -> Tuple[int, List[Dict]]: """Retrieve paginated metadata entries for generations""" try: diff --git a/app/core/model_handlers.py b/app/core/model_handlers.py index 7df26ad4..92391b8c 100644 --- a/app/core/model_handlers.py +++ b/app/core/model_handlers.py @@ -316,9 +316,18 @@ def _handle_openai_request(self, prompt: str): pool=5.0 ) + # Configure httpx client with certificate verification for private cloud + if os.path.exists("/etc/ssl/certs/ca-certificates.crt"): + http_client = httpx.Client( + verify="/etc/ssl/certs/ca-certificates.crt", + timeout=timeout_config + ) + else: + http_client = httpx.Client(timeout=timeout_config) + client = OpenAI( api_key=os.getenv('OPENAI_API_KEY'), - timeout=timeout_config + http_client=http_client ) completion = client.chat.completions.create( model=self.model_id, @@ -380,10 +389,19 @@ def _handle_caii_request(self, prompt: str): pool=5.0 ) + # Configure httpx client with certificate verification for private cloud + if os.path.exists("/etc/ssl/certs/ca-certificates.crt"): + http_client = httpx.Client( + verify="/etc/ssl/certs/ca-certificates.crt", + timeout=timeout_config + ) + else: + http_client = httpx.Client(timeout=timeout_config) + client_ca = OpenAI( base_url=caii_endpoint, api_key=API_KEY, - timeout=timeout_config # Use the comprehensive timeout configuration + http_client=http_client ) completion = client_ca.chat.completions.create( diff --git a/app/core/prompt_templates.py b/app/core/prompt_templates.py index 5f8d9fab..7500177f 100644 --- a/app/core/prompt_templates.py +++ b/app/core/prompt_templates.py @@ -572,8 +572,7 @@ def get_freeform_eval_prompt(model_id: str, examples_str = 
PromptHandler.format_examples_eval(examples) elif examples == [] or examples == None: - examples_str = PromptHandler.format_examples_eval(USE_CASE_CONFIGS_EVALS[use_case].default_examples) - + examples_str = str(USE_CASE_CONFIGS_EVALS[use_case].default_examples) base_prompt = """ You are a brilliant judge on evaluating a set of data with fields and corresponding values Follow the given instructions to understand the structure of given data and evaluate it based on parameters defined for you.""" final_instruction = f"""data row: {row} @@ -668,231 +667,180 @@ def get_freeform_eval_prompt(model_id: str, # else: # final_prompt = "\n" + final_instruction # return final_prompt - + + @staticmethod def create_custom_prompt( - model_id: str, - custom_prompt: str, - example_path: str | None = None, -) -> str: - """ - Create a custom prompt for a language model, optionally including dataset analysis. - - Args: - model_id: The ID of the model to create the prompt for - custom_prompt: The base custom prompt text - example_path: Optional path to an example dataset - - Returns: - A formatted prompt suitable for the specified model - """ + model_id: str, + custom_prompt: str, + example_path: str | None = None, + example: Optional[List[Dict[str, Any]]] = None, + ) -> str: summary_block = "" example_block = "" - if example_path: - print(f"Loading example data from: {example_path}") + if example_path or example: try: - df = DataLoader.load(example_path) - #print(f"Loaded DataFrame with shape: {df.shape}") - - # Apply type inference to improve analysis - df = DataLoader.infer_dtypes(df) - - if "error_message" in df.columns and len(df.columns) == 1: - # Data loading failed - print(f"Error loading data: {df['error_message'][0]}") - # Keep summary and example blocks as empty strings - elif not df.empty: - # ---------- build summary block ---------- - try: - print("Analyzing data...") - summary_dict = DataAnalyser.analyse(df) - - # Create a more structured summary with explanations - 
summary_block = ( - "\n" - "INSTRUCTIONS: The following analysis provides key insights about the dataset that should guide your synthetic data generation. Use these signals to match distributions and relationships when generating synthetic data.\n\n" - ) - - # Add section for columns classification - if "columns" in summary_dict: - summary_block += (""" - ## Column Types\n - "These are all columns identified in the dataset in given specific order:\n\n - Make sure to provide definitions of each column in the same order as they are in the dataset. - Don't change or skip any column name or order. - """) - - - - - summary_block += "\n".join(f"- {col}" for col in summary_dict["columns"]) + "\n\n" - - # Add section for statistical analysis - if "statistical_analysis" in summary_dict: - summary_block += ( - "## Statistical Analysis\n" - "These statistics describe the distributions of values in the dataset:\n\n" + if example_path: + print(f"Loading example data from: {example_path}") + df = DataLoader.load(example_path) + elif example: + df = pd.DataFrame(example) + else: + df = None + + if df is not None: + df = DataLoader.infer_dtypes(df) + + if "error_message" in df.columns and len(df.columns) == 1: + print(f"Error loading data: {df['error_message'][0]}") + elif not df.empty: + try: + print("Analyzing data...") + summary_dict = DataAnalyser.analyse(df) + summary_block = ( + "\n" + "INSTRUCTIONS: The following analysis provides key insights about the dataset that should guide your synthetic data generation. 
Use these signals to match distributions and relationships when generating synthetic data.\n\n" ) - - if "numeric" in summary_dict["statistical_analysis"]: - summary_block += ( - "### Numeric Statistics\n" - "Key statistics for numeric columns (mean, median, min, max, etc.):\n" - f"{json.dumps(summary_dict['statistical_analysis']['numeric'], indent=2)}\n\n" - ) - - if "categorical" in summary_dict["statistical_analysis"]: + if "columns" in summary_dict: + summary_block += (""" + ## Column Types\n + "These are all columns identified in the dataset in given specific order:\n\n + Make sure to provide definitions of each column in the same order as they are in the dataset. + Don't change or skip any column name or order. + """) + summary_block += "\n".join(f"- {col}" for col in summary_dict["columns"]) + "\n\n" + + if "statistical_analysis" in summary_dict: + summary_block += "## Statistical Analysis\n" + if "numeric" in summary_dict["statistical_analysis"]: + summary_block += ( + "### Numeric Statistics\n" + f"{json.dumps(summary_dict['statistical_analysis']['numeric'], indent=2)}\n\n" + ) + if "categorical" in summary_dict["statistical_analysis"]: + summary_block += ( + "### Categorical Statistics\n" + f"{json.dumps(summary_dict['statistical_analysis']['categorical'], indent=2)}\n\n" + ) + if "datetime" in summary_dict["statistical_analysis"]: + summary_block += ( + "### DateTime Statistics\n" + f"{json.dumps(summary_dict['statistical_analysis']['datetime'], indent=2)}\n\n" + ) + + if "cross_row_relationship" in summary_dict: summary_block += ( - "### Categorical Statistics\n" - "Distribution of values in categorical columns:\n" - f"{json.dumps(summary_dict['statistical_analysis']['categorical'], indent=2)}\n\n" + "## Cross-Row Relationships\n" + f"{json.dumps(summary_dict['cross_row_relationship'], indent=2)}\n\n" ) - - if "datetime" in summary_dict["statistical_analysis"]: + if "cross_column_relationship" in summary_dict: summary_block += ( - "### DateTime 
Statistics\n" - "Temporal patterns and ranges in datetime columns:\n" - f"{json.dumps(summary_dict['statistical_analysis']['datetime'], indent=2)}\n\n" + "## Cross-Column Relationships\n" + f"{json.dumps(summary_dict['cross_column_relationship'], indent=2)}\n\n" ) - # Add section for cross-row relationships - if "cross_row_relationship" in summary_dict: - summary_block += ( - "## Cross-Row Relationships\n" - "These insights describe patterns across rows in the dataset:\n\n" - f"{json.dumps(summary_dict['cross_row_relationship'], indent=2)}\n\n" + summary_block += "\n" + print("Data analysis completed successfully.") + except Exception as e: + print(f"Error in data analysis: {str(e)}") + + try: + print("Creating CSV snippet...") + csv_snippet = SummaryFormatter.first_rows_block(df) + example_block = ( + "\n" + "INSTRUCTIONS: The CSV snippet shows the first 10 rows of the " + "original dataset. Preserve this column order, header names, " + "and data types while creating new rows. " + "Use this to create a comprehensive list of all columns and their definitions. 
" + "Make sure the list covers all details and columns which will be required " + "to create data.\n" + f"{csv_snippet}\n" ) - - # Add section for cross-column relationships - if "cross_column_relationship" in summary_dict: - summary_block += ( - "## Cross-Column Relationships\n" - "These insights describe correlations and dependencies between columns:\n\n" - f"{json.dumps(summary_dict['cross_column_relationship'], indent=2)}\n\n" - ) - - # Close the data summary block - summary_block += "\n" - - print("Data analysis completed successfully.") - - except Exception as e: - # Analysis failed → keep summary_block as empty string - print(f"Error in data analysis: {str(e)}") - # Do NOT add any error messages to blocks - - # ---------- build example block ---------- - try: - print("Creating CSV snippet...") - csv_snippet = SummaryFormatter.first_rows_block(df) - example_block = ( - "\n" - "INSTRUCTIONS: The CSV snippet shows the first 10 rows of the " - "original dataset. Preserve this column order, header names, " - "and data types while creating new rows. " - "Use this to create a comprehensive list of all columns and their definitions. 
" - "Make sure the list covers all details and columns which will be required " - "to create data.\n" - f"{csv_snippet}" - "\n" - ) - print("CSV snippet created successfully.") - except Exception as e: - # Snippet failed → keep example_block as empty string - print(f"Error creating CSV snippet: {str(e)}") - # Do NOT add any error messages to blocks + print("CSV snippet created successfully.") + except Exception as e: + print(f"Error creating CSV snippet: {str(e)}") except Exception as e: - print(f"Error processing example file: {str(e)}") - # Keep blocks as empty strings - # Do NOT add any error messages to blocks + print(f"Error processing example data: {str(e)}") - # Construct the final instruction with proper error handling for missing constants try: - - if example_path: - #Construct the final instruction final_instruction = f"""You are a brilliant prompt engineer. - Your task: **{custom_prompt}** + Your task: **{custom_prompt}** - {summary_block}{example_block}Return **only** the finished prompt that can be sent directly to a language model. - Now that you have complete information about the task, follow the below instructions to create prompt. + {summary_block}{example_block}Return **only** the finished prompt that can be sent directly to a language model. + Now that you have complete information about the task, follow the below instructions to create prompt. - - Look at column list and include all columns in your prompt with their definitions. the list should be exhaustive and cover all columns. - - Make sure to have all statistical analysis , cross-row and cross-column relationships in your prompt. - - The prmpt should be absolutely clear in its final goal and there should not be any ambiguity or vagueness in the prompt. - - The prompt should be clear and exhaustive in its column details. + - Look at column list and include all columns in your prompt with their definitions. the list should be exhaustive and cover all columns. 
+ - Make sure to have all statistical analysis, cross-row and cross-column relationships in your prompt. + - The prompt should be absolutely clear in its final goal and there should not be any ambiguity or vagueness in the prompt. + - The prompt should be clear and exhaustive in its column details. - - A few examples are given below for your reference - Code Generation: - - {DEFAULT_CODE_GENERATION_PROMPT} - - Lending Data Generation: - {LENDING_DATA_PROMPT} - - Make sure you just give the prompt in your response which can be directly used by large language model. - No need to give any explanation but just the prompt in same format as the example given above. - Never mention how many rows or dataset size needs to be generated in the final output. - - """ - else: - - final_instruction = f"""You are a brilliant prompt engineer. - Your task: **{custom_prompt}** - - {summary_block}{example_block} - - Return a well-crafted prompt that focuses on: - - The core task objective - - Clear and exhaustive column details - - Key aspects to consider or maintain - - Special requirements for the task + A few examples are given below for your reference + Code Generation: - A few examples are given below for your reference - Code Generation: + {DEFAULT_CODE_GENERATION_PROMPT} + Lending Data Generation: + {LENDING_DATA_PROMPT} - Text to SQL: - {DEFAULT_TEXT2SQL_PROMPT} - Make sure you just give the prompt in your response which can be directly used by large language model. No need to give any explanation but just the prompt in same format as the example given above. Never mention how many rows or dataset size needs to be generated in the final output. - """ + + """ + else: + final_instruction = f"""You are a brilliant prompt engineer. 
+ Your task: **{custom_prompt}** + + {summary_block}{example_block} + + Return a well-crafted prompt that focuses on: + - The core task objective + - Clear and exhaustive column details + - Key aspects to consider or maintain + - Special requirements for the task + + A few examples are given below for your reference + Code Generation: + + {DEFAULT_CODE_GENERATION_PROMPT} + + Text to SQL: + {DEFAULT_TEXT2SQL_PROMPT} + + Make sure you just give the prompt in your response which can be directly used by large language model. + No need to give any explanation but just the prompt in same format as the example given above. + Never mention how many rows or dataset size needs to be generated in the final output. + """ except Exception as e: print(f"Error constructing instruction template: {str(e)}") - # Fallback to a simpler template that still includes any successful blocks final_instruction = f"""You are a brilliant prompt engineer. - Your task: **{custom_prompt}** - - {summary_block}{example_block} - - Return a well-crafted prompt that focuses on: - - The core task objective - - Clear and exhaustive column details - - Key aspects to consider or maintain - - Special requirements for the task - - A few examples are given below for your reference - Code Generation: - - {DEFAULT_CODE_GENERATION_PROMPT} - - Text to SQL: - {DEFAULT_TEXT2SQL_PROMPT} - - Make sure you just give the prompt in your response which can be directly used by large language model. - No need to give any explanation but just the prompt in same format as the example given above. - Never mention how many rows or dataset size needs to be generated in the final output. 
- """ + Your task: **{custom_prompt}** + + {summary_block}{example_block} + + Return a well-crafted prompt that focuses on: + - The core task objective + - Clear and exhaustive column details + - Key aspects to consider or maintain + - Special requirements for the task + + A few examples are given below for your reference + Code Generation: + + {DEFAULT_CODE_GENERATION_PROMPT} + + Text to SQL: + {DEFAULT_TEXT2SQL_PROMPT} + + Make sure you just give the prompt in your response which can be directly used by large language model. + No need to give any explanation but just the prompt in same format as the example given above. + Never mention how many rows or dataset size needs to be generated in the final output. + """ - # Format according to model family try: family = get_model_family(model_id) - if family == ModelFamily.LLAMA: return "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n" \ f"{final_instruction}\n<|eot_id|><|start_header_id|>assistant<|end_header_id|>" @@ -903,21 +851,19 @@ def create_custom_prompt( elif family == ModelFamily.QWEN: system = "You are Qwen, created by Alibaba Cloud. You are a helpful assistant." 
return f"""<|im_start|>system - {system}<|im_end|> - <|im_start|>user + {system}<|im_end|> + <|im_start|>user - {final_instruction}<|im_end|> - <|im_start|>assistant - """ + {final_instruction}<|im_end|> + <|im_start|>assistant + """ else: - # Default format if model family is unknown return "\n" + final_instruction except Exception as e: print(f"Error formatting for model family: {str(e)}") - # Return the raw instruction if formatting fails return final_instruction + - @staticmethod def generate_result_prompt(model_id: str, @@ -999,11 +945,11 @@ def get_freeform_prompt(model_id: str, use_case: UseCase, topic: str, num_questions: int, - omit_questions: List, - example_custom: List[Dict[str, Any]], - example_path: Optional[str], - custom_prompt = Optional[str], - schema = Optional[str] + omit_questions: List, + example_custom: Optional[List[Dict[str, Any]]] = None, # <- now optional + example_path: Optional[str] = None, + custom_prompt: Optional[str] = None, + schema: Optional[str] = None, ) -> str: if example_path: @@ -1184,10 +1130,11 @@ def build_generate_result_prompt(model_id: str, @staticmethod def build_custom_prompt(model_id: str, custom_prompt = Optional[str], - example_path= Optional[str] + example_path= Optional[str], + example = Optional[List[Dict[str, Any]]] ) -> str: - return ModelPrompts.create_custom_prompt(model_id, custom_prompt, example_path) + return ModelPrompts.create_custom_prompt(model_id, custom_prompt, example_path, example) @staticmethod def build_freeform_prompt(model_id: str, diff --git a/app/main.py b/app/main.py index 1ce5d47a..1121786d 100644 --- a/app/main.py +++ b/app/main.py @@ -708,9 +708,10 @@ async def create_custom_prompt(request: CustomPromptRequest, request_id = None): prompt = PromptBuilder.build_custom_prompt( model_id=request.model_id, custom_prompt=request.custom_prompt, - example_path= request.example_path + example_path= request.example_path, + example = request.example ) - print(prompt) + #print(prompt) prompt_gen = 
model_handler.generate_response(prompt, request_id=request_id) return {"generated_prompt":prompt_gen} @@ -767,14 +768,15 @@ async def get_model_id_filtered(): async def get_use_cases(): """Get available use cases""" return { - "usecases": [ - {"id": UseCase.CODE_GENERATION, "name": "Code Generation"}, - {"id": UseCase.TEXT2SQL, "name": "Text to SQL"}, - {"id": UseCase.CUSTOM, "name": "Custom"}, - {"id": UseCase.LENDING_DATA, "name": "Lending Data"}, - {"id": UseCase.CREDIT_CARD_DATA, "name": "Credit Card Data"}, - ] - } + "usecases": [ + {"id": UseCase.CODE_GENERATION, "name": "Code Generation", "description": "Generates paired programming questions and answers with runnable, well-formatted code plus explanations. Ideal for building Q-and-A datasets across programming languages like Python.", "tag": ["Supervised Finetuning", "Data Generation"]}, + {"id": UseCase.TEXT2SQL, "name": "Text to SQL", "description": "Creates natural-language questions matched to clean, executable SQL queries spanning basics, joins, aggregates, subqueries, and window functions. Great for training and evaluation.", "tag": ["Supervised Finetuning", "Data Generation"]}, + {"id": UseCase.CUSTOM, "name": "Custom", "description": "A blank template meant for any user-defined synthetic data task.", "tag": []}, + {"id": UseCase.LENDING_DATA, "name": "Lending Data", "description": "Produces realistic LendingClub-style loan records—complete borrower, loan, and credit-profile fields—while respecting privacy and intricate cross-field logic (grades, DTI, employment, etc.).", "tag": ["Data Generation", "Tabular Data"]}, + {"id": UseCase.CREDIT_CARD_DATA, "name": "Credit Card Data", "description": "Builds user profiles with chronological credit-status histories, maintaining ID consistency and evolving payment behavior. 
Supports training for credit scoring models without real user data.", "tag": ["Data Generation", "Tabular Data"]}, + {"id": UseCase.TICKETING_DATASET, "name": "Ticketing Dataset", "description": "Generates support queries with labelled ticket classification intent (cancel_ticket, customer_service, or report_payment_issue). Perfect for intent-classification or help-desk automation training.", "tag": ["Data Generation", "Intent Classification"]}, + ] +} @app.get("/model/parameters", include_in_schema=True) async def get_model_parameters() -> Dict: @@ -985,7 +987,8 @@ async def get_generation_history( db_manager.update_job_statuses_generate(job_status_map) # Get paginated data - total_count, results = db_manager.get_paginated_generate_metadata(page, page_size) + #total_count, results = db_manager.get_paginated_generate_metadata(page, page_size) + total_count, results = db_manager.get_paginated_generate_metadata_light(page, page_size) # Return in the structure expected by the frontend return { @@ -1420,13 +1423,22 @@ async def perform_upgrade(): # 2. 
Database migrations try: - db_success, db_message = await alembic_manager.handle_database_upgrade() - if db_success: - db_upgraded = True - messages.append(db_message) - else: - messages.append(f"Database upgrade failed: {db_message}") - raise HTTPException(status_code=500, detail=db_message) + # In your upgrade endpoint, you can add this debug line: + print(f"Current working directory: {os.getcwd()}") + print(f"Alembic.ini exists: {os.path.exists('alembic.ini')}") + print("--- Starting database migration via external script ---") + # Use `uv run` to ensure the script runs within the project's virtual environment + # This is more robust than just calling 'python' + result = subprocess.run( + ["uv", "run", "python", "run_migrations.py"], + capture_output=True, + text=True, + check=True # This will raise CalledProcessError on failure + ) + + print(result.stdout) # Log the output from the script + db_upgraded = True + messages.append("Database migration check completed successfully.") except Exception as e: messages.append(f"Database migration failed: {str(e)}") raise HTTPException(status_code=500, detail=str(e)) diff --git a/app/migrations/alembic_manager.py b/app/migrations/alembic_manager.py index 51538c66..7e1b257a 100644 --- a/app/migrations/alembic_manager.py +++ b/app/migrations/alembic_manager.py @@ -1,61 +1,84 @@ # app/migrations/alembic_manager.py -from alembic.config import Config -from alembic import command -from alembic.script import ScriptDirectory -from alembic.runtime.migration import MigrationContext -from pathlib import Path -import os -from sqlalchemy import create_engine +import subprocess class AlembicMigrationManager: def __init__(self, db_path: str = None): - """Initialize Alembic with the same database path as DatabaseManager""" - self.app_path = Path(__file__).parent.parent.parent - - if db_path is None: - db_path = os.path.join(self.app_path, "metadata.db") - self.db_path = db_path - - # Initialize Alembic config - self.alembic_cfg = 
Config(str(self.app_path / "alembic.ini")) - self.alembic_cfg.set_main_option('script_location', str(self.app_path / "alembic")) - self.alembic_cfg.set_main_option('sqlalchemy.url', f'sqlite:///{db_path}') - - # Create engine for version checks - self.engine = create_engine(f'sqlite:///{db_path}') - - async def get_db_version(self) -> str: - """Get current database version""" - with self.engine.connect() as conn: - context = MigrationContext.configure(conn) - return context.get_current_revision() + """Initialize with database path (kept for interface compatibility)""" + self.db_path = db_path or "metadata.db" async def handle_database_upgrade(self) -> tuple[bool, str]: """ - Handle database migrations carefully to avoid disrupting existing data + Simple database migration using alembic upgrade head + No directory changes needed - already in project root """ try: - # First check if alembic_version table exists - try: - version = await self.get_db_version() - if version is None: - # Database exists but no alembic version - stamp current - command.stamp(self.alembic_cfg, "head") - return True, "Existing database stamped with current version" - except Exception: - # No alembic_version table - stamp current - command.stamp(self.alembic_cfg, "head") - return True, "Existing database stamped with current version" + # Run upgrade head - we're already in the right directory + result = subprocess.run( + ["alembic", "upgrade", "head"], + capture_output=True, + text=True, + check=True + ) - # Now check for and apply any new migrations - script = ScriptDirectory.from_config(self.alembic_cfg) - head_revision = script.get_current_head() + # Check if anything was actually upgraded + if "Running upgrade" in result.stdout: + return True, f"Database upgraded successfully: {result.stdout.strip()}" + else: + return True, "Database is already up to date" + + except subprocess.CalledProcessError as e: + error_msg = e.stderr or e.stdout or str(e) + return False, f"Database upgrade failed: 
{error_msg}" + except Exception as e: + return False, f"Error during database upgrade: {str(e)}" + + async def get_migration_status(self) -> dict: + """Get detailed migration status for debugging""" + try: + # Get current version + current_result = subprocess.run( + ["alembic", "current"], + capture_output=True, + text=True, + check=True + ) - if version != head_revision: - command.upgrade(self.alembic_cfg, "head") - return True, "Database schema updated successfully" + # Get head version + head_result = subprocess.run( + ["alembic", "show", "head"], + capture_output=True, + text=True, + check=True + ) - return True, "Database schema is up to date" - + return { + "current": current_result.stdout.strip(), + "head": head_result.stdout.strip(), + "status": "ready" + } + + except subprocess.CalledProcessError as e: + error_msg = e.stderr or e.stdout or str(e) + return {"error": f"Command failed: {error_msg}", "status": "error"} except Exception as e: - return False, f"Error during database upgrade: {str(e)}" \ No newline at end of file + return {"error": str(e), "status": "error"} + + async def get_current_version(self) -> str: + """Get current database version using alembic current command""" + try: + result = subprocess.run( + ["alembic", "current"], + capture_output=True, + text=True, + check=True + ) + + # Extract just the version ID from output like "2b4e8d9f6c3a (head)" + import re + match = re.search(r'([a-f0-9]{12})', result.stdout) + return match.group(1) if match else "none" + + except subprocess.CalledProcessError: + return "none" + except Exception: + return "unknown" \ No newline at end of file diff --git a/app/migrations/alembic_schema_models.py b/app/migrations/alembic_schema_models.py index 3967a113..5eb735f2 100644 --- a/app/migrations/alembic_schema_models.py +++ b/app/migrations/alembic_schema_models.py @@ -35,6 +35,7 @@ class GenerationMetadataModel(Base): job_name = Column(Text, unique=True) job_status = Column(Text) job_creator_name = Column(Text) 
+ completed_rows = Column(Integer) class EvaluationMetadataModel(Base): __tablename__ = 'evaluation_metadata' diff --git a/app/models/request_models.py b/app/models/request_models.py index ed884cdf..eaf9d7e0 100644 --- a/app/models/request_models.py +++ b/app/models/request_models.py @@ -242,6 +242,10 @@ class CustomPromptRequest(BaseModel): inference_type :Optional[str] = "aws_bedrock" caii_endpoint: Optional[str] = None example_path: Optional[str] = None + example: Optional[List[Dict[str, Any]]] = Field( + default=None, + description="JSON array where each object has the same structure (consistent columns), but the structure itself can be defined flexibly per use case" + ) custom_p:bool =True model_config = ConfigDict(protected_namespaces=(), diff --git a/app/services/synthesis_service.py b/app/services/synthesis_service.py index 5aa25bb7..e42ac3c8 100644 --- a/app/services/synthesis_service.py +++ b/app/services/synthesis_service.py @@ -1007,19 +1007,24 @@ async def generate_freeform(self, request: SynthesisRequest, job_name=None, is_d json.dump(final_output, indent=2, fp=f) self.logger.info(f"Saved {len(final_output)} results to {file_path}") - # Check if we have any critical model errors across all topics - has_critical_model_error = any( - topic_errors and any("ModelHandlerError" in error for error in topic_errors) - for _, _, topic_errors, _ in completed_topics - ) + # Find the first critical model error message + first_critical_error = None + for _, _, topic_errors, _ in completed_topics: + if topic_errors: + for error in topic_errors: + if "ModelHandlerError" in error: + first_critical_error = error + break + if first_critical_error: + break # After saving (or if no data), check for critical errors - if has_critical_model_error: + if first_critical_error: if final_output: self.logger.info(f"Saved {len(final_output)} results before failing due to model errors") else: self.logger.info("No results to save before failing due to model errors") - raise 
APIError("Critical model errors encountered during generation") + raise APIError(first_critical_error) # Handle custom prompt, examples and schema custom_prompt_str = PromptHandler.get_default_custom_prompt(request.use_case, request.custom_prompt) @@ -1093,7 +1098,8 @@ def json_serializable(obj): 'input_path': input_path_str, 'input_key': request.input_key, 'output_key': request.output_key, - 'output_value': request.output_value + 'output_value': request.output_value, + 'completed_rows': len(final_output) if final_output else 0 } if is_demo: @@ -1109,7 +1115,7 @@ def json_serializable(obj): generate_file_name = os.path.basename(file_path) if final_output else '' final_output_path = file_path if final_output else '' - self.db.update_job_generate(job_name, generate_file_name, final_output_path, timestamp, job_status) + self.db.update_job_generate(job_name, generate_file_name, final_output_path, timestamp, job_status, len(final_output) if final_output else 0) self.db.backup_and_restore_db() return { "status": "completed" if final_output else "failed", @@ -1157,13 +1163,14 @@ def json_serializable(obj): if saved_partial_results: # Update with actual file information for partial results generate_file_name = os.path.basename(file_path) - final_output_path = file_path + final_output_path = file_path + completed_rows = len(final_output) if final_output else 0 else: # No results saved, use empty values generate_file_name = '' final_output_path = '' - - self.db.update_job_generate(job_name, generate_file_name, final_output_path, timestamp, job_status) + completed_rows = 0 + self.db.update_job_generate(job_name, generate_file_name, final_output_path, timestamp, job_status, completed_rows = completed_rows ) raise def get_health_check(self) -> Dict: diff --git a/build/shell_scripts/build_client.sh b/build/shell_scripts/build_client.sh index dc289f3d..0ea6a63f 100644 --- a/build/shell_scripts/build_client.sh +++ b/build/shell_scripts/build_client.sh @@ -5,10 +5,13 @@ set -eox 
pipefail export UV_HTTP_TIMEOUT=3600 # Ensure uv is installed -if ! command -v uv &> /dev/null; then - echo "Installing uv package manager..." - curl -LsSf https://astral.sh/uv/install.sh | sh - export PATH="$HOME/.cargo/bin:$PATH" +set +e +uv --version >/dev/null 2>&1 +return_code=$? +set -e +if [ $return_code -ne 0 ]; then + echo "Installing uv package manager via pip..." + python -m pip install uv fi # Setup virtual environment and dependencies diff --git a/images/synthetic-data-studio-banner.svg b/images/synthetic-data-studio-banner.svg new file mode 100644 index 00000000..6804615e --- /dev/null +++ b/images/synthetic-data-studio-banner.svg @@ -0,0 +1,16 @@ + + + + + + + + + + + + + + + + diff --git a/run_migrations.py b/run_migrations.py new file mode 100644 index 00000000..dc83e493 --- /dev/null +++ b/run_migrations.py @@ -0,0 +1,33 @@ +import asyncio +import sys +from pathlib import Path + +# Ensure the 'app' directory is in the Python path +ROOT_DIR = Path(__file__).parent +APP_DIR = ROOT_DIR / "app" +sys.path.append(str(ROOT_DIR)) + +from app.migrations.alembic_manager import AlembicMigrationManager + +async def main(): + """ + Initializes the migration manager and runs the database upgrade. + This will always use the latest code from disk. 
+ """ + print("--- Running dedicated migration script ---") + # Assumes your DB file is named metadata.db in the root + db_path = str(ROOT_DIR / "metadata.db") + alembic_manager = AlembicMigrationManager(db_path) + + success, message = await alembic_manager.handle_database_upgrade() + + if not success: + print(f"Migration Error: {message}") + # Exit with a non-zero status code to indicate failure + sys.exit(1) + + print(f"Migration Success: {message}") + print("--- Migration script finished ---") + +if __name__ == "__main__": + asyncio.run(main()) \ No newline at end of file diff --git a/tests/integration/test_synthesis_api.py b/tests/integration/test_synthesis_api.py index 34b81cf9..cbd60e9f 100644 --- a/tests/integration/test_synthesis_api.py +++ b/tests/integration/test_synthesis_api.py @@ -49,7 +49,7 @@ def test_generate_endpoint_with_doc_paths(): def test_generation_history(): # Patch db_manager.get_paginated_generate_metadata to return dummy metadata with pagination info - db_manager.get_paginated_generate_metadata = lambda page, page_size: ( + db_manager.get_paginated_generate_metadata_light = lambda page, page_size: ( 1, # total_count [{"generate_file_name": "qa_pairs_claude_20250210T170521148_test.json", "timestamp": "2024-02-10T12:00:00",