@@ -119,9 +119,9 @@ GenerationInput::TensorPtr tensorrtllm::getTensorSingleStopWordList(int stopToke
119119
120120GenerationInput::TensorPtr tensorrtllm::getTensorChatMLStopWordList ()
121121{
122- std::vector<int32_t > stopWordsTokens = {28789 , 28766 , 321 , 28730 , 416 , 28766 , 28767 , 2 , 32000 , 7 , 8 , 9 , -1 , -1 , -1 ,
123- -1 , -1 , - 1 }; // Extend with -1 for increased length
124- return gptSession->getBufferManager ().copyFrom (stopWordsTokens, ITensor::makeShape ({1 , 2 , 9 }), MemoryType::kGPU );
122+ std::vector<int32_t > stopWordsTokens = { 28766 , 321 , 28730 , 416 , 28766 , 28767 , 2 , 32000 , 6 , 7 , 8 , -1 , -1 , -1 ,
123+ -1 , -1 }; // Extend with -1 for increased length
124+ return gptSession->getBufferManager ().copyFrom (stopWordsTokens, ITensor::makeShape ({1 , 2 , 8 }), MemoryType::kGPU );
125125}
126126
127127GenerationInput tensorrtllm::createGenerationInput (std::vector<int32_t > inputIdsHost)
@@ -189,6 +189,7 @@ void inferenceThread(std::shared_ptr<inferenceState> inferState, std::vector<int
189189 // Valid prevPos, proceed with slicing the string from prevPos to the end
190190 std::string stringTok (text.begin () + inferState->prevPos , text.end ());
191191 std::lock_guard<std::mutex> guard (inferState->queueMutex ); // Protect access with a lock
192+ std::cout << stringTok << std::endl;
192193 inferState->textsToStream .push (stringTok);
193194 }
194195 else if (inferState->prevPos >= text.size ())
0 commit comments