Merge pull request #25868 from Abdurrahheem:ash/add-gpt2-sample
Add sample for GPT2 inference #25868

### Pull Request Readiness Checklist

This PR adds a sample for running inference with the GPT-2 model, specifically the GPT-2 implementation from [this repository](https://github.com/karpathy/build-nanogpt). Currently, inference in OpenCV is only possible with a fixed window size, because dynamic shapes are not yet supported.

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on code under GPL or another license that is incompatible with OpenCV
- [x] The PR is proposed to the proper branch
- [ ] There is a reference to the original bug report and related work
- [ ] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
      Patch to opencv_extra has the same branch name.
- [x] The feature is well documented and sample code can be built with the project CMake
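As a rough illustration of that fixed-window limitation, here is a minimal sketch, assuming the model was exported with a fixed 13-token window and saved as `gpt2.onnx` (both chosen purely for illustration), of how a prompt has to be shaped before it can be fed to the network, as the sample added below does:

```python
import numpy as np
import tiktoken
import cv2 as cv

WINDOW = 13  # illustrative: the fixed sequence length the ONNX model was exported with

net = cv.dnn.readNet("gpt2.onnx")  # illustrative path
enc = tiktoken.get_encoding("gpt2")
tokens = enc.encode("Hello, I'm a language model,")

# The input shape is baked into the exported model, so the prompt must be
# truncated or padded; the sample pads with GPT-2's <space> token (id 220).
tokens = tokens[:WINDOW] + [220] * max(0, WINDOW - len(tokens))

x = np.tile(np.array(tokens), (1, 1))  # shape (batch, WINDOW)
net.setInput(x)
logits = net.forward()                 # shape (batch, WINDOW, vocab_size)
print(logits.shape)
```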
This commit is contained in:
parent
060c24bec9
commit
88f05e49be
@@ -728,7 +728,7 @@ CV__DNN_INLINE_NS_BEGIN
      * @param outLayerShapes output parameter for output layers shapes;
      *                       order is the same as in layersIds
      */
-    void getLayerShapes(const MatShape& netInputShape,
+    CV_WRAP void getLayerShapes(const MatShape& netInputShape,
                         const int& netInputType,
                         const int layerId,
                         CV_OUT std::vector<MatShape>& inLayerShapes,
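Adding CV_WRAP exposes this getLayerShapes overload through the generated bindings, which is what lets the new Python sample query the model's fixed input shape. A minimal sketch of the resulting Python call, mirroring the sample added below (the model path is illustrative):

```python
import cv2 as cv

net = cv.dnn.readNet("gpt2.onnx")  # illustrative path

# With CV_WRAP, the CV_OUT parameters become return values in Python:
# a pair (inLayerShapes, outLayerShapes) for the given input shape, type and layer id.
in_shapes, out_shapes = net.getLayerShapes([], 0, 0)
seq_len = in_shapes[0][1]  # fixed sequence length baked into the exported model
print("expected input shape:", in_shapes[0], "-> window of", seq_len, "tokens")
```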
@@ -108,3 +108,21 @@ void Copy_vector_string_to_List(JNIEnv* env, std::vector<std::string>& vs, jobject list)
         env->DeleteLocalRef(element);
     }
 }
+
+#ifdef HAVE_OPENCV_DNN
+void Copy_vector_MatShape_to_List(JNIEnv* env, std::vector<cv::dnn::MatShape>& vs, jobject list)
+{
+    static jclass juArrayList = ARRAYLIST(env);
+    jmethodID m_clear = LIST_CLEAR(env, juArrayList);
+    jmethodID m_add = LIST_ADD(env, juArrayList);
+
+    env->CallVoidMethod(list, m_clear);
+    for (size_t i = 0; i < vs.size(); i++)
+    {
+        jintArray element = env->NewIntArray((jint)vs[i].size());
+        env->SetIntArrayRegion(element, 0, (jint)vs[i].size(), (const jint*)&vs[i][0]);
+        env->CallBooleanMethod(list, m_add, element);
+        env->DeleteLocalRef(element);
+    }
+}
+#endif // HAVE_OPENCV_DNN
@@ -23,4 +23,11 @@ std::vector<std::string> List_to_vector_string(JNIEnv* env, jobject list);
 
 void Copy_vector_string_to_List(JNIEnv* env, std::vector<std::string>& vs, jobject list);
 
-#endif /* LISTCONVERTERS_HPP */
+#ifdef HAVE_OPENCV_DNN
+#include "opencv2/dnn.hpp"
+
+void Copy_vector_MatShape_to_List(JNIEnv* env, std::vector<cv::dnn::MatShape>& vs, jobject list);
+
+#endif // HAVE_OPENCV_DNN
+
+#endif /* LISTCONVERTERS_HPP */
125  samples/dnn/gpt2_inference.py  Normal file
@@ -0,0 +1,125 @@
'''
This is a sample script to run GPT-2 inference in OpenCV using an ONNX model.
The script loads the GPT-2 model and runs inference on a given prompt.
Currently the script only works with a fixed-size window, which means
you have to specify a prompt of the same length as the one used when the model was exported to ONNX.


Exporting the GPT-2 model to ONNX.
To export the GPT-2 model to ONNX, use the following procedure:

1. Clone the fork of Andrej Karpathy's GPT-2 repository:

    git clone https://github.com/Abdurrahheem/build-nanogpt/tree/ash/export-gpt2-onnx

2. Install the required dependencies:

    pip install -r requirements.txt

3. Export the model to ONNX:

    python export2onnx.py --promt=<Any-promt-you-want> --batch_size=<batch-size>


Run the script:
1. Install the required dependencies:

    pip install tiktoken==0.7.0

2. Run the script:

    python gpt2_inference.py --model=<path-to-onnx-model> --max_seq_len=<max-output-length> --batch_size=<batch-size-used-while-exporting> --prompt=<prompt-of-the-same-length-used-while-exporting>
'''


from copy import deepcopy
import numpy as np
import tiktoken
import argparse
import cv2 as cv

def parse_args():
    parser = argparse.ArgumentParser(description='Use this script to run GPT-2 inference in OpenCV',
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--model', type=str, required=True, help='Path to the GPT-2 ONNX model file.')
    parser.add_argument("--max_seq_len", type=int, default=30, help="Number of tokens to continue.")
    parser.add_argument("--batch_size", type=int, default=5, help="Number of sequences to generate (must match the batch size used at export).")
    parser.add_argument("--prompt", type=str, default="Hello, I'm a language model,", help="Prompt to start with.")
    parser.add_argument("--seed", type=int, default=0, help="Random seed")
    return parser.parse_args()

def stable_softmax(logits):
    exp_logits = np.exp(logits - np.max(logits, axis=-1, keepdims=True))
    return exp_logits / np.sum(exp_logits, axis=-1, keepdims=True)


def gpt2_inference(net, tokens, max_length, num_return_sequences=5):

    print("Running GPT-2 inference...")
    x = np.array(tokens)
    x = np.tile(x, (num_return_sequences, 1))

    output_buffer = deepcopy(x)
    counter = x.shape[1]
    while counter < max_length:

        net.setInput(x)
        logits = net.forward()

        # logits has shape (B, seq_length, vocab_size); keep only the last token's logits
        logits = logits[:, -1, :] # (B, vocab_size)

        # Get the probabilities using softmax
        probs = stable_softmax(logits)

        # Top-k sampling with k=50
        topk_indices = np.argpartition(probs, -50, axis=-1)[:, -50:]
        topk_probs = np.take_along_axis(probs, topk_indices, axis=-1)

        # Normalize top-k probabilities
        topk_probs /= np.sum(topk_probs, axis=-1, keepdims=True)

        # Select a token from the top-k probabilities
        sampled_indices = [np.random.choice(topk_indices[i], p=topk_probs[i]) for i in range(len(topk_probs))]
        sampled_indices = np.array(sampled_indices).reshape(-1, 1)

        # Append the sampled token to the sequence
        x = np.concatenate((x, sampled_indices), axis=1)
        x = x[:, 1:]  # drop the oldest token: OpenCV currently requires a fixed-size input window

        output_buffer = np.concatenate((output_buffer, sampled_indices), axis=1)
        counter += 1
    print("Inference done!")
    return output_buffer


if __name__ == '__main__':

    args = parse_args()
    np.random.seed(args.seed)
    max_length = args.max_seq_len
    num_return_sequences = args.batch_size
    prompt = args.prompt

    net = cv.dnn.readNet(args.model)
    input_token_size = net.getLayerShapes([], 0, 0)[0][0][1]

    enc = tiktoken.get_encoding('gpt2')
    tokens = enc.encode(prompt)

    # The prompt must match the model's fixed input length:
    # truncate it if it is longer, pad it if it is shorter
    if len(tokens) > input_token_size:
        tokens = tokens[:input_token_size]
    elif len(tokens) < input_token_size:
        tokens2pad = input_token_size - len(tokens)
        # pad with the GPT-2 <space> token (id 220)
        tokens += [220] * tokens2pad

    output_buffer = gpt2_inference(net, tokens, max_length, num_return_sequences)

    for i in range(num_return_sequences):
        tokens = output_buffer[i, :max_length].tolist()
        decoded = enc.decode(tokens)
        print(">>>>", decoded)