Commit

perf: token count; feat: chunk size
c121914yu committed Jun 23, 2023
1 parent 9aace87 commit ae1f7a8
Showing 5 changed files with 86 additions and 149 deletions.
1 change: 0 additions & 1 deletion client/package.json
@@ -29,7 +29,6 @@
"eventsource-parser": "^0.1.0",
"formidable": "^2.1.1",
"framer-motion": "^9.0.6",
"graphemer": "^1.4.0",
"hyperdown": "^2.4.29",
"immer": "^9.0.19",
"jsonwebtoken": "^9.0.0",
9 changes: 0 additions & 9 deletions client/pnpm-lock.yaml

Some generated files are not rendered by default.

89 changes: 59 additions & 30 deletions client/src/pages/kb/components/SelectFileModal.tsx
@@ -1,4 +1,4 @@
import React, { useState, useCallback } from 'react';
import React, { useState, useCallback, useRef } from 'react';
import {
Box,
Flex,
@@ -24,24 +24,10 @@ import { TrainingModeEnum } from '@/constants/plugin';
import { getErrText } from '@/utils/tools';
import { ChatModelMap, OpenAiChatEnum, embeddingPrice } from '@/constants/model';
import { formatPrice } from '@/utils/user';
import MySlider from '@/components/Slider';

const fileExtension = '.txt,.doc,.docx,.pdf,.md';

const modeMap = {
[TrainingModeEnum.qa]: {
maxLen: 8000,
slideLen: 3000,
price: ChatModelMap[OpenAiChatEnum.GPT3516k].price,
isPrompt: true
},
[TrainingModeEnum.index]: {
maxLen: 1000,
slideLen: 500,
price: embeddingPrice,
isPrompt: false
}
};

const SelectFileModal = ({
onClose,
onSuccess,
@@ -51,6 +37,16 @@ const SelectFileModal = ({
onSuccess: () => void;
kbId: string;
}) => {
const [modeMap, setModeMap] = useState({
[TrainingModeEnum.qa]: {
maxLen: 8000,
price: ChatModelMap[OpenAiChatEnum.GPT3516k].price
},
[TrainingModeEnum.index]: {
maxLen: 600,
price: embeddingPrice
}
});
const [btnLoading, setBtnLoading] = useState(false);
const { toast } = useToast();
const [prompt, setPrompt] = useState('');
@@ -200,7 +196,7 @@ const SelectFileModal = ({
});
}
setBtnLoading(false);
}, [files, mode, mutate, openConfirm, toast]);
}, [files, mode, modeMap, mutate, openConfirm, toast]);

return (
<Modal isOpen={true} onClose={onClose} isCentered>
@@ -244,19 +240,52 @@
/>
</Flex>
{/* Content description */}
{modeMap[mode].isPrompt && (
<Flex w={'100%'} px={5} alignItems={'center'} mt={4}>
<Box flex={'0 0 70px'} mr={2}>
下面是
</Box>
<Input
placeholder="提示词,例如: Laf的介绍/关于gpt4的论文/一段长文本"
value={prompt}
onChange={(e) => setPrompt(e.target.value)}
size={'sm'}
/>
</Flex>
)}
<Flex w={'100%'} px={5} alignItems={'center'} mt={4}>
{mode === TrainingModeEnum.qa && (
<>
<Box flex={'0 0 70px'} mr={2}>
下面是
</Box>
<Input
placeholder="提示词,例如: Laf的介绍/关于gpt4的论文/一段长文本"
value={prompt}
onChange={(e) => setPrompt(e.target.value)}
size={'sm'}
/>
</>
)}
{/* chunk size */}
{mode === TrainingModeEnum.index && (
<Flex w={'100%'} px={5} alignItems={'center'} mt={4}>
<Box w={['70px']} flexShrink={0}>
段落长度
</Box>
<Box flex={1} ml={'10px'}>
<MySlider
markList={[
{ label: '300', value: 300 },
{ label: '1000', value: 1000 }
]}
width={['100%', '260px']}
min={300}
max={1000}
step={50}
activeVal={modeMap[TrainingModeEnum.index].maxLen}
setVal={(val) => {
setModeMap((state) => ({
...state,
[TrainingModeEnum.index]: {
maxLen: val,
price: embeddingPrice
}
}));
}}
/>
</Box>
</Flex>
)}
</Flex>

{/* Text content */}
<Box flex={'1 0 0'} px={5} h={0} w={'100%'} overflowY={'auto'} mt={4}>
{files.slice(0, 100).map((item, i) => (
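For orientation, a minimal sketch of how the slider-backed modeMap state can feed the splitting step at import time; the handler name and file shape here are illustrative, not taken from this diff:

// Hypothetical import handler inside the component: index-mode chunk size
// now comes from the slider-backed modeMap state (300-1000, step 50)
// instead of a hard-coded constant.
const importFile = (fileText: string) => {
  const { maxLen } = modeMap[mode]; // defaults: 8000 for qa, 600 for index
  return splitText_token({ text: fileText, maxLen });
};
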
12 changes: 3 additions & 9 deletions client/src/utils/file.ts
@@ -148,15 +148,9 @@ export const fileDownload = ({
* slideLen - The size of the before and after Text
* maxLen > slideLen
*/
export const splitText_token = ({
text,
maxLen,
slideLen
}: {
text: string;
maxLen: number;
slideLen: number;
}) => {
export const splitText_token = ({ text, maxLen }: { text: string; maxLen: number }) => {
const slideLen = Math.floor(maxLen * 0.3);

try {
const enc = getOpenAiEncMap()[OpenAiChatEnum.GPT35];
// filter empty text. encode sentence
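With the new signature, callers pass only maxLen and the overlap is derived internally. A minimal usage sketch under that assumption (the input text and length are illustrative):

import { splitText_token } from '@/utils/file';

// slideLen is computed internally as Math.floor(maxLen * 0.3), so
// maxLen = 600 yields a 180-token overlap between adjacent chunks.
const chunks = splitText_token({
  text: '...some long document text to be chunked...',
  maxLen: 600
});
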
124 changes: 24 additions & 100 deletions client/src/utils/plugin/openai.ts
@@ -1,68 +1,20 @@
import { encoding_for_model, type Tiktoken } from '@dqbd/tiktoken';
import { encoding_for_model } from '@dqbd/tiktoken';
import type { ChatItemType } from '@/types/chat';
import { ChatRoleEnum } from '@/constants/chat';
import { type ChatCompletionRequestMessage, ChatCompletionRequestMessageRoleEnum } from 'openai';
import { ChatCompletionRequestMessageRoleEnum } from 'openai';
import { OpenAiChatEnum } from '@/constants/model';
import Graphemer from 'graphemer';
import axios from 'axios';
import dayjs from 'dayjs';
import type { MessageItemType } from '@/pages/api/openapi/v1/chat/completions';

const textDecoder = new TextDecoder();
const graphemer = new Graphemer();

export const getOpenAiEncMap = () => {
if (typeof window !== 'undefined') {
window.OpenAiEncMap = window.OpenAiEncMap || {
[OpenAiChatEnum.GPT35]: encoding_for_model('gpt-3.5-turbo', {
'<|im_start|>': 100264,
'<|im_end|>': 100265,
'<|im_sep|>': 100266
}),
[OpenAiChatEnum.GPT3516k]: encoding_for_model('gpt-3.5-turbo', {
'<|im_start|>': 100264,
'<|im_end|>': 100265,
'<|im_sep|>': 100266
}),
[OpenAiChatEnum.GPT4]: encoding_for_model('gpt-4', {
'<|im_start|>': 100264,
'<|im_end|>': 100265,
'<|im_sep|>': 100266
}),
[OpenAiChatEnum.GPT432k]: encoding_for_model('gpt-4-32k', {
'<|im_start|>': 100264,
'<|im_end|>': 100265,
'<|im_sep|>': 100266
})
};
if (typeof window !== 'undefined' && window.OpenAiEncMap) {
return window.OpenAiEncMap;
}
if (typeof global !== 'undefined') {
global.OpenAiEncMap = global.OpenAiEncMap || {
[OpenAiChatEnum.GPT35]: encoding_for_model('gpt-3.5-turbo', {
'<|im_start|>': 100264,
'<|im_end|>': 100265,
'<|im_sep|>': 100266
}),
[OpenAiChatEnum.GPT3516k]: encoding_for_model('gpt-3.5-turbo', {
'<|im_start|>': 100264,
'<|im_end|>': 100265,
'<|im_sep|>': 100266
}),
[OpenAiChatEnum.GPT4]: encoding_for_model('gpt-4', {
'<|im_start|>': 100264,
'<|im_end|>': 100265,
'<|im_sep|>': 100266
}),
[OpenAiChatEnum.GPT432k]: encoding_for_model('gpt-4-32k', {
'<|im_start|>': 100264,
'<|im_end|>': 100265,
'<|im_sep|>': 100266
})
};
if (typeof global !== 'undefined' && global.OpenAiEncMap) {
return global.OpenAiEncMap;
}
return {
const enc = {
[OpenAiChatEnum.GPT35]: encoding_for_model('gpt-3.5-turbo', {
'<|im_start|>': 100264,
'<|im_end|>': 100265,
@@ -84,6 +36,15 @@ export const getOpenAiEncMap = () => {
'<|im_sep|>': 100266
})
};

if (typeof window !== 'undefined') {
window.OpenAiEncMap = enc;
}
if (typeof global !== 'undefined') {
global.OpenAiEncMap = enc;
}

return enc;
};
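The function above now builds the encoder map once and memoizes it on window in the browser or global on the server, so repeated token counts reuse the same Tiktoken instances. A sketch of the access pattern, assuming the cached shape shown in the diff:

import { getOpenAiEncMap } from '@/utils/plugin/openai';
import { OpenAiChatEnum } from '@/constants/model';

// First call constructs the Tiktoken encoders; later calls hit the cache.
const enc = getOpenAiEncMap()[OpenAiChatEnum.GPT35];
const ids = enc.encode('hello world'); // token ids (Uint32Array)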

export const adaptChatItem_openAI = ({
@@ -112,55 +73,18 @@ export function countOpenAIToken({
messages: ChatItemType[];
model: `${OpenAiChatEnum}`;
}) {
function getChatGPTEncodingText(
messages: ChatCompletionRequestMessage[],
model: `${OpenAiChatEnum}`
) {
const isGpt3 = model.startsWith('gpt-3.5-turbo');

const msgSep = isGpt3 ? '\n' : '';
const roleSep = isGpt3 ? '\n' : '<|im_sep|>';

return [
messages
.map(({ name = '', role, content }) => {
return `<|im_start|>${name || role}${roleSep}${content}<|im_end|>`;
})
.join(msgSep),
`<|im_start|>assistant${roleSep}`
].join(msgSep);
}
function text2TokensLen(encoder: Tiktoken, inputText: string) {
const encoding = encoder.encode(inputText, 'all');
const segments: { text: string; tokens: { id: number; idx: number }[] }[] = [];

let byteAcc: number[] = [];
let tokenAcc: { id: number; idx: number }[] = [];
let inputGraphemes = graphemer.splitGraphemes(inputText);

for (let idx = 0; idx < encoding.length; idx++) {
const token = encoding[idx]!;
byteAcc.push(...encoder.decode_single_token_bytes(token));
tokenAcc.push({ id: token, idx });

const segmentText = textDecoder.decode(new Uint8Array(byteAcc));
const graphemes = graphemer.splitGraphemes(segmentText);

if (graphemes.every((item, idx) => inputGraphemes[idx] === item)) {
segments.push({ text: segmentText, tokens: tokenAcc });

byteAcc = [];
tokenAcc = [];
inputGraphemes = inputGraphemes.slice(graphemes.length);
}
}

return segments.reduce((memo, i) => memo + i.tokens.length, 0) ?? 0;
}
const diffVal = model.startsWith('gpt-3.5-turbo') ? 3 : 2;

const adaptMessages = adaptChatItem_openAI({ messages, reserveId: true });

return text2TokensLen(getOpenAiEncMap()[model], getChatGPTEncodingText(adaptMessages, model));
const token = adaptMessages.reduce((sum, item) => {
const text = `${item.role}\n${item.content}`;
const enc = getOpenAiEncMap()[model];
const encodeText = enc.encode(text);
const tokens = encodeText.length + diffVal;
return sum + tokens;
}, 0);

return token;
}

export const openAiSliceTextByToken = ({
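The rewritten countOpenAIToken replaces the grapheme-level segmentation with a cheap per-message approximation: encode `${role}\n${content}` and add a fixed overhead (3 tokens for gpt-3.5-turbo models, 2 otherwise), summing across messages. A worked sketch of the arithmetic; the message content is illustrative:

import { getOpenAiEncMap } from '@/utils/plugin/openai';
import { OpenAiChatEnum } from '@/constants/model';

// One user message 'Hello' under gpt-3.5-turbo (diffVal = 3):
const enc = getOpenAiEncMap()[OpenAiChatEnum.GPT35];
const perMessage = enc.encode('user\nHello').length + 3; // encoded length + overhead
// countOpenAIToken returns the sum of the per-message counts.
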
