Skip to content

Commit 0f866fc

Browse files
authored
feat: text collection auto save for a txt file (#4924)
1 parent 05c7ba4 commit 0f866fc

File tree

9 files changed

+76
-10
lines changed

9 files changed

+76
-10
lines changed

packages/global/core/dataset/utils.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,5 +40,6 @@ export function getSourceNameIcon({
4040
/**
 * Predicts a data-limit length for a training batch by scaling the number of
 * items with a factor that depends on the training mode.
 *
 * @param mode - Training mode that selects the scaling factor.
 * @param data - Items of the batch; only its length is read.
 * @returns `data.length` multiplied by 20 for `qa`, 5 for `auto`, 2 for
 *   `image`, and left unscaled for every other mode.
 */
export const predictDataLimitLength = (mode: TrainingModeEnum, data: any[]) => {
  const itemCount = data.length;

  switch (mode) {
    case TrainingModeEnum.qa:
      return itemCount * 20;
    case TrainingModeEnum.auto:
      return itemCount * 5;
    case TrainingModeEnum.image:
      return itemCount * 2;
    default:
      return itemCount;
  }
};

packages/service/common/file/gridfs/controller.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -223,7 +223,7 @@ export const readFileContentFromMongo = async ({
223223
rawText: string;
224224
filename: string;
225225
}> => {
226-
const bufferId = `${fileId}-${customPdfParse}`;
226+
const bufferId = `${String(fileId)}-${customPdfParse}`;
227227
// read buffer
228228
const fileBuffer = await getRawTextBuffer(bufferId);
229229
if (fileBuffer) {

packages/service/common/file/gridfs/utils.ts

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,57 @@
11
import { detectFileEncoding } from '@fastgpt/global/common/file/tools';
22
import { PassThrough } from 'stream';
3+
import { getGridBucket } from './controller';
4+
import { type BucketNameEnum } from '@fastgpt/global/common/file/constants';
5+
import { retryFn } from '@fastgpt/global/common/system/utils';
6+
7+
/**
 * Stores a text string as a file in a GridFS bucket.
 *
 * The text is encoded with `Buffer.from` (UTF-8 by default) and uploaded in a
 * single `end()` call. The upload is wrapped in `retryFn`; every attempt opens
 * its own upload stream, because a stream that has already ended or errored
 * cannot accept the payload again.
 *
 * @param bucket - Name of the GridFS bucket to write to.
 * @param filename - File name recorded for the uploaded file.
 * @param text - Text content to store.
 * @param metadata - Metadata passed through to the upload stream options.
 * @returns The id of the uploaded file, converted to a string.
 */
export const createFileFromText = async ({
  bucket,
  filename,
  text,
  metadata
}: {
  bucket: `${BucketNameEnum}`;
  filename: string;
  text: string;
  metadata: Record<string, any>;
}) => {
  const gridBucket = getGridBucket(bucket);
  const buffer = Buffer.from(text);

  // Chunk size: as large as possible, capped at 14MB and never below 128KB.
  const KB = 1024;
  const maxChunkSize = 14 * KB * KB;
  const minChunkSize = 128 * KB;
  const alignment = 64 * KB;
  const targetChunkCount = 10;

  // Ideal size splits the file into roughly `targetChunkCount` chunks.
  const idealChunkSize = Math.min(Math.ceil(buffer.length / targetChunkCount), maxChunkSize);
  // Apply the lower bound, then round up to the next multiple of 64KB.
  const chunkSizeBytes = Math.ceil(Math.max(idealChunkSize, minChunkSize) / alignment) * alignment;

  return retryFn(
    () =>
      new Promise<{ fileId: string }>((resolve, reject) => {
        // Open the stream inside the retried function so each attempt starts clean.
        const uploadStream = gridBucket.openUploadStream(filename, {
          metadata,
          chunkSizeBytes
        });

        // Attach listeners before writing so no event can be missed.
        uploadStream.once('finish', () => {
          resolve({ fileId: String(uploadStream.id) });
        });
        uploadStream.once('error', reject);

        uploadStream.end(buffer);
      })
  );
};
355

456
export const gridFsStream2Buffer = (stream: NodeJS.ReadableStream) => {
557
return new Promise<Buffer>((resolve, reject) => {

projects/app/src/pageComponents/dataset/detail/Import/diffSource/FileCustomText.tsx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ const CustomTextInput = () => {
4949
createStatus: 'waiting',
5050
rawText: data.value,
5151
sourceName: data.name,
52-
icon: 'file/fill/manual'
52+
icon: 'file/fill/txt'
5353
}
5454
]);
5555
goToNext();

projects/app/src/pages/api/core/dataset/collection/create/text.ts

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ import { DatasetCollectionTypeEnum } from '@fastgpt/global/core/dataset/constant
66
import { NextAPI } from '@/service/middleware/entry';
77
import { WritePermissionVal } from '@fastgpt/global/support/permission/constant';
88
import { type CreateCollectionResponse } from '@/global/core/dataset/api';
9+
import { createFileFromText } from '@fastgpt/service/common/file/gridfs/utils';
910

1011
async function handler(req: NextApiRequest): CreateCollectionResponse {
1112
const { name, text, ...body } = req.body as TextCreateDatasetCollectionParams;
@@ -18,16 +19,28 @@ async function handler(req: NextApiRequest): CreateCollectionResponse {
1819
per: WritePermissionVal
1920
});
2021

22+
// 1. Create file from text
23+
const filename = `${name}.txt`;
24+
const { fileId } = await createFileFromText({
25+
bucket: 'dataset',
26+
filename,
27+
text,
28+
metadata: {
29+
teamId,
30+
uid: tmbId
31+
}
32+
});
33+
2134
const { collectionId, insertResults } = await createCollectionAndInsertData({
2235
dataset,
2336
rawText: text,
2437
createCollectionParams: {
2538
...body,
2639
teamId,
2740
tmbId,
28-
type: DatasetCollectionTypeEnum.virtual,
29-
30-
name
41+
type: DatasetCollectionTypeEnum.file,
42+
fileId,
43+
name: filename
3144
}
3245
});
3346

projects/app/src/service/events/generateQA.ts renamed to projects/app/src/service/core/dataset/queues/generateQA.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ import { MongoDatasetTraining } from '@fastgpt/service/core/dataset/training/sch
22
import { pushQAUsage } from '@/service/support/wallet/usage/push';
33
import { TrainingModeEnum } from '@fastgpt/global/core/dataset/constants';
44
import { createChatCompletion } from '@fastgpt/service/core/ai/config';
5-
import type { ChatCompletionMessageParam, StreamChatType } from '@fastgpt/global/core/ai/type.d';
5+
import type { ChatCompletionMessageParam } from '@fastgpt/global/core/ai/type.d';
66
import { addLog } from '@fastgpt/service/common/system/log';
77
import { splitText2Chunks } from '@fastgpt/global/common/string/textSplitter';
88
import { replaceVariable } from '@fastgpt/global/common/string/tools';

projects/app/src/service/events/utils.ts renamed to projects/app/src/service/core/dataset/queues/utils.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import { TeamErrEnum } from '@fastgpt/global/common/error/code/team';
22
import { checkTeamAIPoints } from '@fastgpt/service/support/permission/teamLimit';
3-
import { sendOneInform } from '../support/user/inform/api';
3+
import { sendOneInform } from '../../../support/user/inform/api';
44
import { lockTrainingDataByTeamId } from '@fastgpt/service/core/dataset/training/controller';
55
import { InformLevelEnum } from '@fastgpt/global/support/user/inform/constants';
66

@@ -18,7 +18,7 @@ export const checkTeamAiPointsAndLock = async (teamId: string) => {
1818
templateParam: {},
1919
teamId
2020
});
21-
console.log('余额不足,暂停【向量】生成任务');
21+
console.log('余额不足,暂停训练生成任务');
2222
await lockTrainingDataByTeamId(teamId);
2323
} catch (error) {}
2424
}

projects/app/src/service/core/dataset/training/utils.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
1-
import { generateQA } from '@/service/events/generateQA';
2-
import { generateVector } from '@/service/events/generateVector';
1+
import { generateQA } from '@/service/core/dataset/queues/generateQA';
2+
import { generateVector } from '@/service/core/dataset/queues/generateVector';
33
import { TrainingModeEnum } from '@fastgpt/global/core/dataset/constants';
44
import { type DatasetTrainingSchemaType } from '@fastgpt/global/core/dataset/type';
55
import { MongoDatasetTraining } from '@fastgpt/service/core/dataset/training/schema';

0 commit comments

Comments
 (0)