Skip to content

Commit

Permalink
llama : fix bpe tokenize from byte (ggerganov#2889)
Browse files: browse the repository at this point in the history
  • Loading branch information
opparco authored Sep 3, 2023
1 parent d9151e6 commit 3730134
Showing 1 changed file with 8 additions and 2 deletions.
10 changes: 8 additions & 2 deletions llama.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3366,9 +3366,15 @@ struct llm_tokenizer_bpe {
  std::string byte_str(1, *j);
  auto token_multibyte = vocab.token_to_id.find(byte_str);
  if (token_multibyte == vocab.token_to_id.end()) {
- fprintf(stderr,"ERROR: byte not found in vocab: '%s'\n", byte_str.c_str());
+ try {
+ llama_token token_byte = llama_byte_to_token(vocab, *j);
+ output.push_back(token_byte);
+ } catch (const std::out_of_range & err) {
+ fprintf(stderr,"ERROR: byte not found in vocab: '%s'\n", byte_str.c_str());
+ }
+ } else {
+ output.push_back((*token_multibyte).second);
  }
- output.push_back((*token_multibyte).second);
  }
  } else {
  output.push_back((*token).second);
Expand Down

0 comments on commit 3730134

Please sign in to comment.