Skip to content

Commit

Permalink
Optimize the file storage structure of the knowledge base (labring#386)
Browse files Browse the repository at this point in the history
  • Loading branch information
c121914yu authored Oct 10, 2023
1 parent 29d1527 commit d0041a9
Show file tree
Hide file tree
Showing 41 changed files with 591 additions and 231 deletions.
2 changes: 1 addition & 1 deletion .vscode/settings.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
"editor.formatOnSave": true,
"editor.mouseWheelZoom": true,
"typescript.tsdk": "node_modules/typescript/lib",
"editor.defaultFormatter": "esbenp.prettier-vscode",
"prettier.prettierPath": "./node_modules/prettier",
"i18n-ally.localesPaths": [
"projects/app/public/locales"
],
Expand Down
2 changes: 1 addition & 1 deletion docSite/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
## 本地运行

1. 安装 go 语言环境。
2. 安装 hugo。 [二进制下载](https://github.com/gohugoio/hugo/releases/tag/v0.117.0)
2. 安装 hugo。 [二进制下载](https://github.com/gohugoio/hugo/releases/tag/v0.117.0),注意需要安装 extended 版本。
3. cd docSite
4. hugo serve
5. 访问 http://localhost:1313
8 changes: 8 additions & 0 deletions docSite/content/docs/development/configuration.md
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,14 @@ weight: 520
"maxToken": 16000,
"price": 0,
"prompt": ""
},
"QGModel": { // 生成下一步指引模型
"model": "gpt-3.5-turbo",
"name": "GPT35-4k",
"maxToken": 4000,
"price": 0,
"prompt": "",
"functionCall": false
}
}
```
8 changes: 8 additions & 0 deletions docSite/content/docs/development/design/_index.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
---
weight: 540
title: "设计方案"
description: "FastGPT 部分设计方案"
icon: public
draft: false
images: []
---
25 changes: 25 additions & 0 deletions docSite/content/docs/development/design/dataset.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
---
weight: 541
title: "数据集"
description: "FastGPT 数据集中文件与数据的设计方案"
icon: dataset
draft: false
images: []
---

## 文件与数据的关系

在 FastGPT 中,文件会通过 MongoDB 的 FS 存储,而具体的数据会通过 PostgreSQL 存储,PG 中的数据会有一列 file_id,关联对应的文件。考虑到旧版本的兼容,以及手动输入、标注数据等,我们给 file_id 增加了一些特殊的值,如下:

- manual: 手动输入
- mark: 手动标注的数据

注意,file_id 仅在插入数据时会写入,变更时无法修改。

## 文件导入流程

1. 上传文件到 MongoDB 的 FS 中,获取 file_id,此时文件标记为 `unused` 状态
2. 浏览器解析文件,获取对应的文本和 chunk
3. 给每个 chunk 打上 file_id
4. 点击上传数据:将文件的状态改为 `used`,并将数据推送到 mongo `training` 表中等待训练
5. 由训练线程从 mongo 中取数据,并在获取向量后插入到 pg。
29 changes: 29 additions & 0 deletions docSite/content/docs/installation/upgrading/447.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
---
title: 'V4.4.7'
description: 'FastGPT V4.4.7 更新(需执行升级脚本)'
icon: 'upgrade'
draft: false
toc: true
weight: 840
---

## 执行初始化 API

发起 1 个 HTTP 请求({{rootkey}} 替换成环境变量里的`rootkey`,{{host}}替换成自己域名)

1. https://xxxxx/api/admin/initv447

```bash
curl --location --request POST 'https://{{host}}/api/admin/initv447' \
--header 'rootkey: {{rootkey}}' \
--header 'Content-Type: application/json'
```

初始化 pg 索引以及将 file_id 中空对象转成 manual 对象。如果数据多,可能需要较长时间,可以通过日志查看进度。

## 功能介绍

### FastGPT V4.4.7

1. 优化了数据库文件 crud。
2. 兼容链接读取,作为 source。
23 changes: 23 additions & 0 deletions packages/common/tools/file.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
import { strIsLink } from './str';

/**
 * Icon lookup table for uploaded files.
 *
 * `suffix` is a regular-expression fragment describing a file extension
 * (without the leading dot). The `'.'` entry is the generic fallback icon.
 */
export const fileImgs = [
  { suffix: 'pdf', src: '/imgs/files/pdf.svg' },
  { suffix: 'csv', src: '/imgs/files/csv.svg' },
  // NOTE(review): was '(doc|docs)'; 'docs' looks like a typo for 'docx'.
  { suffix: '(doc|docx)', src: '/imgs/files/doc.svg' },
  { suffix: 'txt', src: '/imgs/files/txt.svg' },
  { suffix: 'md', src: '/imgs/files/markdown.svg' },
  { suffix: '.', src: '/imgs/files/file.svg' }
];

/**
 * Resolve the icon path for a file name based on its extension.
 *
 * The extension is matched case-insensitively against the END of the name
 * (`.pdf`, `.csv`, ...). Previously each suffix was tested against the whole
 * name, so e.g. `cmd.exe` was shown with the markdown icon because it
 * contains "md".
 *
 * @param name File name, e.g. `report.pdf`.
 * @returns The icon path, the generic file icon when no known extension
 *          matches, or `undefined` for an empty name.
 */
export function getFileIcon(name = '') {
  if (!name) return undefined;

  // The '.' entry is the catch-all and is handled separately below.
  const known = fileImgs.find(
    ({ suffix }) => suffix !== '.' && new RegExp(`\\.${suffix}$`, 'i').test(name)
  );
  if (known) return known.src;

  return fileImgs.find(({ suffix }) => suffix === '.')?.src;
}
/**
 * Resolve the icon for a "special" data source that is not a stored file.
 *
 * @param name Source identifier: the literal `manual` or `mark`, or a link.
 * @returns The matching icon path, or `undefined` when `name` is neither a
 *          special id nor something `strIsLink` accepts.
 */
export function getSpecialFileIcon(name = '') {
  switch (name) {
    case 'manual':
      return '/imgs/files/manual.svg';
    case 'mark':
      return '/imgs/files/mark.svg';
    default:
      return strIsLink(name) ? '/imgs/files/link.svg' : undefined;
  }
}
5 changes: 5 additions & 0 deletions packages/common/tools/str.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
/**
 * Report whether a string looks like a link.
 *
 * A value is accepted when it starts with `://` (optionally preceded by
 * `http` or `https`), with `www.`, or with `/`, and is followed by at least
 * two more characters per the pattern below.
 *
 * @param str Candidate string; `undefined` and `''` yield `false`.
 * @returns `true` when the pattern matches, otherwise `false`.
 */
export function strIsLink(str?: string) {
  const linkPattern = /^((http|https)?:\/\/|www\.|\/)[^\s/$.?#].[^\s]*$/i;
  return !!str && linkPattern.test(str);
}
15 changes: 15 additions & 0 deletions packages/core/dataset/constant.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
/** Reserved `file_id` values for dataset rows that do not belong to an uploaded file. */
export enum DatasetSpecialIdEnum {
  manual = 'manual',
  mark = 'mark'
}

/** i18n keys used to label each special id (`name`) and its source (`sourceName`). */
export const datasetSpecialIdMap = {
  [DatasetSpecialIdEnum.manual]: { name: 'kb.Manual Data', sourceName: 'kb.Manual Input' },
  [DatasetSpecialIdEnum.mark]: { name: 'kb.Mark Data', sourceName: 'kb.Manual Mark' }
};

/** All special ids as plain strings, in enum declaration order. */
export const datasetSpecialIds: string[] = Object.values(DatasetSpecialIdEnum);
8 changes: 8 additions & 0 deletions packages/core/dataset/utils.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
import { datasetSpecialIds } from './constant';
import { strIsLink } from '@fastgpt/common/tools/str';

/**
 * Report whether a `file_id` is a special value rather than a stored file id.
 *
 * @param id The `file_id` to check.
 * @returns `true` when `id` is listed in `datasetSpecialIds` or is accepted
 *          by `strIsLink`; otherwise `false`.
 */
export function isSpecialFileId(id: string) {
  return datasetSpecialIds.includes(id) || strIsLink(id);
}
4 changes: 3 additions & 1 deletion packages/core/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,9 @@
"version": "1.0.0",
"dependencies": {
"openai": "^3.3.0",
"tunnel": "^0.0.6"
"tunnel": "^0.0.6",
"@fastgpt/common": "workspace:*",
"@fastgpt/support": "workspace:*"
},
"devDependencies": {
"@types/tunnel": "^0.0.4"
Expand Down
5 changes: 4 additions & 1 deletion packages/support/package.json
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
{
"name": "@fastgpt/support",
"version": "1.0.0"
"version": "1.0.0",
"dependencies": {
"@fastgpt/common": "workspace:*"
}
}
12 changes: 11 additions & 1 deletion pnpm-lock.yaml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion projects/app/package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "app",
"version": "4.4.6",
"version": "4.4.7",
"private": false,
"scripts": {
"dev": "next dev",
Expand Down
6 changes: 3 additions & 3 deletions projects/app/public/docs/chatProblem.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,6 @@
- [计费规则](https://doc.fastgpt.run/docs/pricing/)

**其他问题**
| 交流群 | 小助手 |
| ----------------------- | -------------------- |
| ![](https://otnvvf-imgs.oss.laf.run/wxqun300.jpg) | ![](https://otnvvf-imgs.oss.laf.run/wx300.jpg) |
| 添加小助手进入交流群 |
| ----------------------- |
| ![](https://otnvvf-imgs.oss.laf.run/wx300.jpg) |
1 change: 1 addition & 0 deletions projects/app/public/imgs/files/link.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
1 change: 1 addition & 0 deletions projects/app/public/imgs/files/manual.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
1 change: 1 addition & 0 deletions projects/app/public/imgs/files/mark.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
8 changes: 7 additions & 1 deletion projects/app/public/locales/en/common.json
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,8 @@
"Output": "Output",
"Password inconsistency": "Password inconsistency",
"Rename": "Rename",
"Rename Failed": "Rename Failed",
"Rename Success": "Rename Success",
"Search": "Search",
"Status": "Status",
"Update Successful": "Update Successful",
Expand Down Expand Up @@ -214,10 +216,14 @@
"Filename": "Filename",
"Files": "{{total}} Files",
"Folder Name": "Input folder name",
"Insert Data": "Insert",
"Manual Data": "Manual Data",
"Manual Input": "Manual Input",
"Manual Mark": "Manual Mark",
"Mark Data": "Mark Data",
"Move Failed": "Move Failed",
"My Dataset": "My Dataset",
"No Folder": "No Folder",
"Other Data": "Other Data",
"Select Dataset": "Select Dataset",
"Select Folder": "Enter folder",
"Upload Time": "Upload Time",
Expand Down
8 changes: 7 additions & 1 deletion projects/app/public/locales/zh/common.json
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,8 @@
"Output": "输出",
"Password inconsistency": "两次密码不一致",
"Rename": "重命名",
"Rename Failed": "重命名失败",
"Rename Success": "重命名成功",
"Search": "搜索",
"Status": "状态",
"Update Successful": "更新成功",
Expand Down Expand Up @@ -214,10 +216,14 @@
"Filename": "文件名",
"Files": "文件: {{total}}个",
"Folder Name": "输入文件夹名称",
"Insert Data": "插入",
"Manual Data": "手动录入",
"Manual Input": "手动录入",
"Manual Mark": "手动标注",
"Mark Data": "标注数据",
"Move Failed": "移动出现错误~",
"My Dataset": "我的知识库",
"No Folder": "没有子目录了~",
"Other Data": "其他数据",
"Select Dataset": "选择该知识库",
"Select Folder": "进入文件夹",
"Upload Time": "上传时间",
Expand Down
3 changes: 2 additions & 1 deletion projects/app/src/components/ChatBox/ResponseTags.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,8 @@ const ResponseTags = ({ responseData = [] }: { responseData?: ChatHistoryItemRes
quoteList: responseData
.filter((item) => item.moduleType === FlowModuleTypeEnum.chatNode)
.map((item) => item.quoteList)
.flat(),
.flat()
.filter((item) => item) as QuoteItemType[],
historyPreview: chatData?.historyPreview,
runningTime: +responseData.reduce((sum, item) => sum + (item.runningTime || 0), 0).toFixed(2)
};
Expand Down
11 changes: 10 additions & 1 deletion projects/app/src/components/ChatBox/index.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ import styles from './index.module.scss';
import Script from 'next/script';
import { postQuestionGuide } from '@/api/core/ai/agent/api';
import { splitGuideModule } from './utils';
import { DatasetSpecialIdEnum } from '@fastgpt/core/dataset/constant';

const nanoid = customAlphabet('abcdefghijklmnopqrstuvwxyz1234567890', 24);

Expand Down Expand Up @@ -511,13 +512,20 @@ const ChatBox = (

// add guide text listener
useEffect(() => {
const windowMessage = ({ data }: MessageEvent<{ type: 'sendPrompt'; text: string }>) => {
if (data?.type === 'sendPrompt' && data?.text) {
handleSubmit((item) => sendPrompt(item, data.text))();
}
};
window.addEventListener('message', windowMessage);
event.on('guideClick', ({ text }: { text: string }) => {
if (!text) return;
handleSubmit((data) => sendPrompt(data, text))();
});

return () => {
event.off('guideClick');
window.removeEventListener('message', windowMessage);
};
}, [handleSubmit, sendPrompt]);

Expand Down Expand Up @@ -995,7 +1003,8 @@ const ChatBox = (
defaultValues={{
dataId: adminMarkData.dataId,
q: adminMarkData.q,
a: adminMarkData.a
a: adminMarkData.a,
file_id: DatasetSpecialIdEnum.mark
}}
/>
)}
Expand Down
9 changes: 0 additions & 9 deletions projects/app/src/constants/common.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,6 @@ export enum UserAuthTypeEnum {
findPassword = 'findPassword'
}

export const fileImgs = [
{ suffix: 'pdf', src: '/imgs/files/pdf.svg' },
{ suffix: 'csv', src: '/imgs/files/csv.svg' },
{ suffix: '(doc|docs)', src: '/imgs/files/doc.svg' },
{ suffix: 'txt', src: '/imgs/files/txt.svg' },
{ suffix: 'md', src: '/imgs/files/markdown.svg' },
{ suffix: '.', src: '/imgs/files/file.svg' }
];

export enum TrackEventName {
windowError = 'windowError',
pageError = 'pageError',
Expand Down
1 change: 0 additions & 1 deletion projects/app/src/constants/dataset.ts
Original file line number Diff line number Diff line change
Expand Up @@ -34,4 +34,3 @@ export const KbTypeMap = {
};

export const FolderAvatarSrc = '/imgs/files/folder.svg';
export const OtherFileId = 'other';
Loading

0 comments on commit d0041a9

Please sign in to comment.