Merge pull request #26584 from Abdurrahheem:ash/fix-gpt2-sample

Update printing in GPT2 sample #26584 This PR updates how the GPT2 sample prints its output. **Note**: As the length of the prompt increases during inference, token generation slows down. It may be the right time to introduce QK caching to speed up inference. ### Pull Request Readiness Checklist See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request - [x] I agree to contribute to the project under Apache 2 License. - [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV - [x] The PR is proposed to the proper branch - [x] There is a reference to the original bug report and related work - [x] There is accuracy test, performance test and test data in opencv_extra repository, if applicable Patch to opencv_extra has the same branch name. - [ ] The feature is well documented and sample code can be built with the project CMake
2025-08-06 14:36:36 +08:00 · 2024-12-16 14:43:47 +03:00 · 2024-12-16 14:43:47 +03:00 · 41df003d06
commit 41df003d06
parent 633ca0d6eb
1 changed files with 24 additions and 23 deletions
--- a/samples/dnn/gpt2_inference.py
+++ b/samples/dnn/gpt2_inference.py
@ -30,21 +30,17 @@ Run the script:
    python gpt2_inference.py --model=<path-to-onnx-model>  --prompt=<use-promt-of-the-same-length-used-while-exporting>
 '''

-
-
 import numpy as np
 import tiktoken
 import argparse
 import cv2 as cv
-from tqdm import tqdm

 def parse_args():
    parser = argparse.ArgumentParser(description='Use this script to run GPT-2 inference in OpenCV',
                                    formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--model', type=str, required=True, help='Path to GPT-2 model ONNX model file.')
    parser.add_argument("--prompt", type=str, default="Hello, I'm a language model,", help="Prompt to start with.")
-    parser.add_argument("--max_seq_len", type=int, default=40, help="Number of tokens to continue.")
-    parser.add_argument("--batch_size", type=int, default=1, help="Number of batches.")
+    parser.add_argument("--max_seq_len", type=int, default=1024, help="Number of tokens to continue.")
    parser.add_argument("--seed", type=int, default=0, help="Random seed")
    return parser.parse_args()

@ -53,21 +49,27 @@ def stable_softmax(logits):
    return exp_logits / np.sum(exp_logits, axis=-1, keepdims=True)


-def gpt2_inference(net, tokens, max_length, num_return_sequences=1):
+
+def gpt2_inference(net, tokens, max_length, tokenizer):

    print("Inferencing GPT-2 model...")
    x = np.array(tokens)
-    x = np.tile(x, (num_return_sequences, 1)).astype(np.int32)
+    x = np.tile(x, (1, 1)).astype(np.int32)
    pos = np.arange(0, len(x), dtype=np.int32)

-    counter = x.shape[1]
-    pbar = tqdm(total=max_length - counter, desc="Generating tokens")
-    while counter < max_length:
+    # warm up
+    net.setInputsNames(['input_ids', 'position_ids'])
+    net.setInput(x, 'input_ids')
+    net.setInput(pos, 'position_ids')
+    logits = net.forward()
+
+    stop_tokens = (50256, ) ## could be extended to include more stop tokens
+    print("\n", tokenizer.decode(tokens), sep="", end="")
+    while 0 < max_length and x[:, -1] not in stop_tokens:

        net.setInputsNames(['input_ids', 'position_ids'])
        net.setInput(x, 'input_ids')
        net.setInput(pos, 'position_ids')
-
        logits = net.forward()

        # logits is assumed to be (B, seq_length, vocab_size) and needs to be the last token's logits
@ -87,23 +89,27 @@ def gpt2_inference(net, tokens, max_length, num_return_sequences=1):
        sampled_indices = [np.random.choice(topk_indices[i], p=topk_probs[i]) for i in range(len(topk_probs))]
        sampled_indices = np.array(sampled_indices).reshape(-1, 1)

+        # Decode and print the new token
+        new_word = tokenizer.decode([sampled_indices[0, 0]])
+
+        ## clean the prints from the previous line
+        print(new_word, end='', flush=True)
+
        # Append to the sequence
        x = np.concatenate((x, sampled_indices), axis=1)
        pos = np.arange(0, x.shape[1], dtype=np.int32) # shape (T)

-        counter += 1
-        pbar.update(1)
+        max_length -= 1

-    pbar.close()
-    print("Inference done!")
-    return x
+    print('\n')

 if __name__ == '__main__':

    args = parse_args()
+    print("Preparing GPT-2 model...")
+
    np.random.seed(args.seed)
    max_length = args.max_seq_len
-    num_return_sequences = args.batch_size
    prompt = args.prompt

    net = cv.dnn.readNet(args.model)
@ -111,9 +117,4 @@ if __name__ == '__main__':
    enc = tiktoken.get_encoding('gpt2')
    tokens = enc.encode(prompt)

-    output = gpt2_inference(net, tokens, max_length, num_return_sequences)
-
-    for i in range(num_return_sequences):
-        tokens = output[i].tolist()
-        decoded = enc.decode(tokens)
-        print(">>>>", decoded)
+    gpt2_inference(net, tokens, max_length, enc)