class TokenCounter(object):
    """Count tokens in text with the tiktoken encoding of a given model."""

    def __init__(self, model="gpt-3.5-turbo"):
        """
        :param model: name of model, type: string
        """
        self.model = model

    def count(self, _input: Union[List, str]) -> Union[List[int], int]:
        """
        Count the tokens of a single text or of every text in a list.

        The encoding is looked up from ``self.model``; when tiktoken raises
        ``KeyError`` for that model a warning is printed and the
        ``cl100k_base`` encoding is used instead.

        :param _input: user input, type: str or List[str]
        :return: the number of tokens used by the text, type: int for a str
            input, List[int] (one count per element, same order) for a list
        :raises NotImplementedError: if ``_input`` is neither a str nor a list
        """
        # Reject unsupported input before resolving the encoding, so a bad
        # argument fails immediately and never triggers the fallback warning.
        if not isinstance(_input, (list, str)):
            raise NotImplementedError(
                f"not support data type for {type(_input)}, please use str or List[str]."
            )

        try:
            encoding = tiktoken.encoding_for_model(self.model)
        except KeyError:
            print("Warning: model not found. Using cl100k_base encoding.")
            encoding = tiktoken.get_encoding("cl100k_base")

        if isinstance(_input, str):
            return len(encoding.encode(_input))
        return [len(encoding.encode(text)) for text in _input]
def test_case1(self):
    """A single string input yields a single integer token count."""
    # NOTE(review): self.token_cnt is presumably a TokenCounter built in a
    # setUp that is outside this view - confirm.
    sentence = "who are you?"
    self.assertEqual(self.token_cnt.count(_input=sentence), 4)
def test_case2(self):
    """A list of strings yields one token count per string, in order."""
    # NOTE(review): self.token_cnt is presumably a TokenCounter built in a
    # setUp that is outside this view - confirm.
    sentences = ["who are you?", "How's it going on?"]
    expected_counts = [4, 6]
    actual_counts = self.token_cnt.count(_input=sentences)
    self.assertEqual(actual_counts, expected_counts)
def test_case3(self):
    """An unsupported input type raises NotImplementedError with a fixed message."""
    expected_message = "not support data type for <class 'int'>, please use str or List[str]."
    with self.assertRaises(NotImplementedError) as raised:
        self.token_cnt.count(_input=23)
    # The captured exception is only available once the with-block has exited.
    self.assertEqual(str(raised.exception), expected_message)
if __name__ == '__main__':
    # Assemble the suite by hand so the three cases run in this fixed order.
    suite = unittest.TestSuite()
    for case_name in ('test_case1', 'test_case2', 'test_case3'):
        suite.addTest(TestTokenCounter(case_name))
    unittest.TextTestRunner().run(suite)