only load cl100k tokeniser

fix #109, #13
This commit is contained in:
Jing Hua 2023-03-25 22:49:16 +08:00
parent 8ff5079266
commit 6a6d2fe819

View file

@@ -1,6 +1,7 @@
import { MessageInterface, ModelOptions } from '@type/chat';
import { encoding_for_model } from '@dqbd/tiktoken';
import { Tiktoken } from '@dqbd/tiktoken/lite';
const cl100k_base = await import('@dqbd/tiktoken/encoders/cl100k_base.json');
// https://github.com/dqbd/tiktoken/issues/23#issuecomment-1483317174
export const getChatGPTEncoding = (
@@ -9,11 +10,16 @@ export const getChatGPTEncoding = (
) => {
const isGpt3 = model === 'gpt-3.5-turbo';
const encoder = encoding_for_model(model, {
'<|im_start|>': 100264,
'<|im_end|>': 100265,
'<|im_sep|>': 100266,
});
const encoder = new Tiktoken(
cl100k_base.bpe_ranks,
{
...cl100k_base.special_tokens,
'<|im_start|>': 100264,
'<|im_end|>': 100265,
'<|im_sep|>': 100266,
},
cl100k_base.pat_str
);
const msgSep = isGpt3 ? '\n' : '';
const roleSep = isGpt3 ? '\n' : '<|im_sep|>';