From 6a6d2fe8193acf68a38107a5f1f9b3800ec2fca1 Mon Sep 17 00:00:00 2001
From: Jing Hua
Date: Sat, 25 Mar 2023 22:49:16 +0800
Subject: [PATCH] only load cl100k tokeniser

fix #109, #13
---
 src/utils/messageUtils.ts | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/src/utils/messageUtils.ts b/src/utils/messageUtils.ts
index 21f0ec4..b971e5b 100644
--- a/src/utils/messageUtils.ts
+++ b/src/utils/messageUtils.ts
@@ -1,6 +1,7 @@
 import { MessageInterface, ModelOptions } from '@type/chat';
-import { encoding_for_model } from '@dqbd/tiktoken';
+import { Tiktoken } from '@dqbd/tiktoken/lite';
+const cl100k_base = await import('@dqbd/tiktoken/encoders/cl100k_base.json');
 
 // https://github.com/dqbd/tiktoken/issues/23#issuecomment-1483317174
 export const getChatGPTEncoding = (
@@ -9,11 +10,16 @@ export const getChatGPTEncoding = (
 ) => {
   const isGpt3 = model === 'gpt-3.5-turbo';
 
-  const encoder = encoding_for_model(model, {
-    '<|im_start|>': 100264,
-    '<|im_end|>': 100265,
-    '<|im_sep|>': 100266,
-  });
+  const encoder = new Tiktoken(
+    cl100k_base.bpe_ranks,
+    {
+      ...cl100k_base.special_tokens,
+      '<|im_start|>': 100264,
+      '<|im_end|>': 100265,
+      '<|im_sep|>': 100266,
+    },
+    cl100k_base.pat_str
+  );
 
   const msgSep = isGpt3 ? '\n' : '';
   const roleSep = isGpt3 ? '\n' : '<|im_sep|>';
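
For context, a minimal usage sketch (not part of the patch): it shows how the
@dqbd/tiktoken/lite encoder that this commit switches to can count tokens in a
plain string. The countTokens helper is hypothetical; the Tiktoken constructor,
encode, free, and the cl100k_base JSON import are the ones the patch uses.

// Sketch only: load the cl100k_base ranks once at module scope,
// the same way the patch does for getChatGPTEncoding.
import { Tiktoken } from '@dqbd/tiktoken/lite';
const cl100k_base = await import('@dqbd/tiktoken/encoders/cl100k_base.json');

// Hypothetical helper: count cl100k_base tokens in a plain string.
const countTokens = (text: string): number => {
  const encoder = new Tiktoken(
    cl100k_base.bpe_ranks,
    cl100k_base.special_tokens,
    cl100k_base.pat_str
  );
  const tokens = encoder.encode(text);
  encoder.free(); // release the WASM-backed encoder when done
  return tokens.length;
};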