Replace strcpy and strncpy by new inline helper function

Signed-off-by: Stefan Weil <sw@weilnetz.de>
2024-11-27 12:49:35 +08:00 · 2024-05-23 23:27:50 +02:00 · 2024-05-23 23:27:50 +02:00 · c5b0c2f421
commit c5b0c2f421
parent ea82f919a6
10 changed files with 33 additions and 55 deletions
--- a/src/api/altorenderer.cpp
+++ b/src/api/altorenderer.cpp
@ -14,6 +14,7 @@
 // limitations under the License.

 #include "errcode.h" // for ASSERT_HOST
+#include "helpers.h" // for copy_string
 #ifdef _WIN32
 #  include "host.h"  // windows.h for MultiByteToWideChar, ...
 #endif
@ -270,12 +271,9 @@ char *TessBaseAPI::GetAltoText(ETEXT_DESC *monitor, int page_number) {

  alto_str << "\t\t\t</PrintSpace>\n"
           << "\t\t</Page>\n";
-  const std::string &text = alto_str.str();

-  char *result = new char[text.length() + 1];
-  strcpy(result, text.c_str());
  delete res_it;
-  return result;
+  return copy_string(alto_str.str());
 }

 } // namespace tesseract
--- a/src/api/baseapi.cpp
+++ b/src/api/baseapi.cpp
@ -33,7 +33,7 @@
 #include "equationdetect.h" // for EquationDetect, destructor of equ_detect_
 #endif // ndef DISABLED_LEGACY_ENGINE
 #include "errcode.h" // for ASSERT_HOST
-#include "helpers.h" // for IntCastRounded, chomp_string
+#include "helpers.h" // for IntCastRounded, chomp_string, copy_string
 #include "host.h"    // for MAX_PATH
 #include "imageio.h" // for IFF_TIFF_G4, IFF_TIFF, IFF_TIFF_G3, ...
 #ifndef DISABLED_LEGACY_ENGINE
@ -1378,9 +1378,7 @@ char *TessBaseAPI::GetUTF8Text() {
    const std::unique_ptr<const char[]> para_text(it->GetUTF8Text(RIL_PARA));
    text += para_text.get();
  } while (it->Next(RIL_PARA));
-  char *result = new char[text.length() + 1];
-  strncpy(result, text.c_str(), text.length() + 1);
-  return result;
+  return copy_string(text);
 }

 static void AddBoxToTSV(const PageIterator *it, PageIteratorLevel level, std::string &text) {
@ -1509,9 +1507,7 @@ char *TessBaseAPI::GetTSVText(int page_number) {
 #endif
  }

-  char *ret = new char[tsv_str.length() + 1];
-  strcpy(ret, tsv_str.c_str());
-  return ret;
+  return copy_string(tsv_str);
 }

 /** The 5 numbers output for each box (the usual 4 and a page number.) */
@ -1759,10 +1755,7 @@ char *TessBaseAPI::GetOsdText(int page_number) {
         << "Orientation confidence: " << orient_conf << "\n"
         << "Script: " << script_name << "\n"
         << "Script confidence: " << script_conf << "\n";
-  const std::string &text = stream.str();
-  char *result = new char[text.length() + 1];
-  strcpy(result, text.c_str());
-  return result;
+  return copy_string(stream.str());
 }

 #endif // ndef DISABLED_LEGACY_ENGINE
--- a/src/api/hocrrenderer.cpp
+++ b/src/api/hocrrenderer.cpp
@ -25,6 +25,7 @@
 #  include "host.h" // windows.h for MultiByteToWideChar, ...
 #endif
 #include <tesseract/renderer.h>
+#include "helpers.h"        // for copy_string
 #include "tesseractclass.h" // for Tesseract

 namespace tesseract {
@ -480,10 +481,7 @@ char *TessBaseAPI::GetHOCRText(ETEXT_DESC *monitor, int page_number) {
  }
  hocr_str << "  </div>\n";

-  const std::string &text = hocr_str.str();
-  char *result = new char[text.length() + 1];
-  strcpy(result, text.c_str());
-  return result;
+  return copy_string(hocr_str.str());
 }

 /**********************************************************************
--- a/src/api/lstmboxrenderer.cpp
+++ b/src/api/lstmboxrenderer.cpp
@ -18,6 +18,7 @@

 #include <tesseract/baseapi.h> // for TessBaseAPI
 #include <tesseract/renderer.h>
+#include "helpers.h"        // for copy_string
 #include "tesseractclass.h" // for Tesseract

 namespace tesseract {
@ -81,10 +82,8 @@ char *TessBaseAPI::GetLSTMBoxText(int page_number = 0) {
    AddBoxToLSTM(right, bottom, top, image_height_, page_number, lstm_box_str);
    lstm_box_str += "\n"; // end of PAGE
  }
-  char *ret = new char[lstm_box_str.length() + 1];
-  strcpy(ret, lstm_box_str.c_str());
  delete res_it;
-  return ret;
+  return copy_string(lstm_box_str);
 }

 /**********************************************************************
--- a/src/api/pagerenderer.cpp
+++ b/src/api/pagerenderer.cpp
@ -14,6 +14,7 @@
 // limitations under the License.

 #include "errcode.h" // for ASSERT_HOST
+#include "helpers.h" // for copy_string
 #ifdef _WIN32
 #  include "host.h" // windows.h for MultiByteToWideChar, ...
 #endif
@ -1143,15 +1144,8 @@ char *TessBaseAPI::GetPAGEText(ETEXT_DESC *monitor, int page_number) {
  const std::string &text = reading_order_str.str();
  reading_order_str.str("");

-  // Allocate memory for result to hold text.length() characters plus a null
-  // terminator Safely copy the string into result, ensuring no overflow strncpy
-  // does not necessarily null-terminate the destination, so do it manually
-  char *result = new char[text.length() + 1];
-  strncpy(result, text.c_str(), text.length());
-  result[text.length()] = '\0';
-
  delete res_it;
-  return result;
+  return copy_string(text);
 }

 } // namespace tesseract
--- a/src/api/pdfrenderer.cpp
+++ b/src/api/pdfrenderer.cpp
@ -22,7 +22,7 @@

 #include "pdf_ttf.h"
 #include "tprintf.h"
-#include "helpers.h" // for Swap
+#include "helpers.h" // for Swap, copy_string

 #include <allheaders.h>
 #include <tesseract/baseapi.h>
@ -497,10 +497,7 @@ char *TessPDFRenderer::GetPDFTextObjects(TessBaseAPI *api, double width, double
      pdf_str << "ET\n"; // end the text object
    }
  }
-  const std::string &text = pdf_str.str();
-  char *result = new char[text.length() + 1];
-  strcpy(result, text.c_str());
-  return result;
+  return copy_string(pdf_str.str());
 }

 bool TessPDFRenderer::BeginDocumentHandler() {
--- a/src/api/wordstrboxrenderer.cpp
+++ b/src/api/wordstrboxrenderer.cpp
@ -18,6 +18,7 @@

 #include <tesseract/baseapi.h> // for TessBaseAPI
 #include <tesseract/renderer.h>
+#include "helpers.h"        // for copy_string
 #include "tesseractclass.h" // for Tesseract

 namespace tesseract {
@ -80,10 +81,8 @@ char *TessBaseAPI::GetWordStrBoxText(int page_number = 0) {
    wordstr_box_str += " " + std::to_string(page_number); // row for tab for EOL
    wordstr_box_str += "\n";
  }
-  char *ret = new char[wordstr_box_str.length() + 1];
-  strcpy(ret, wordstr_box_str.c_str());
  delete res_it;
-  return ret;
+  return copy_string(wordstr_box_str);
 }

 /**********************************************************************
--- a/src/ccmain/ltrresultiterator.cpp
+++ b/src/ccmain/ltrresultiterator.cpp
@ -19,6 +19,7 @@

 #include <tesseract/ltrresultiterator.h>

+#include "helpers.h"  // for copy_string
 #include "pageres.h"
 #include "tesseractclass.h"

@ -76,10 +77,7 @@ char *LTRResultIterator::GetUTF8Text(PageIteratorLevel level) const {
      }
    } while (level == RIL_BLOCK && res_it.block() == res_it.prev_block());
  }
-  int length = text.length() + 1;
-  char *result = new char[length];
-  strncpy(result, text.c_str(), length);
-  return result;
+  return copy_string(text);
 }

 // Set the string inserted at the end of each text line. "\n" by default.
@ -310,11 +308,7 @@ char *LTRResultIterator::WordTruthUTF8Text() const {
  if (!HasTruthString()) {
    return nullptr;
  }
-  std::string truth_text = it_->word()->blamer_bundle->TruthString();
-  int length = truth_text.length() + 1;
-  char *result = new char[length];
-  strncpy(result, truth_text.c_str(), length);
-  return result;
+  return copy_string(it_->word()->blamer_bundle->TruthString());
 }

 // Returns the null terminated UTF-8 encoded normalized OCR string for the
@ -330,10 +324,7 @@ char *LTRResultIterator::WordNormedUTF8Text() const {
  for (unsigned i = 0; i < best_choice->length(); ++i) {
    ocr_text += unicharset->get_normed_unichar(best_choice->unichar_id(i));
  }
-  auto length = ocr_text.length() + 1;
-  char *result = new char[length];
-  strncpy(result, ocr_text.c_str(), length);
-  return result;
+  return copy_string(ocr_text);
 }

 // Returns a pointer to serialized choice lattice.
--- a/src/ccmain/resultiterator.cpp
+++ b/src/ccmain/resultiterator.cpp
@ -20,6 +20,7 @@

 #include <tesseract/resultiterator.h>

+#include "helpers.h"  // for copy_string
 #include "pageres.h"
 #include "tesseractclass.h"
 #include "unicharset.h"
@ -681,10 +682,7 @@ char *ResultIterator::GetUTF8Text(PageIteratorLevel level) const {
      }
    } break;
  }
-  int length = text.length() + 1;
-  char *result = new char[length];
-  strncpy(result, text.c_str(), length);
-  return result;
+  return copy_string(text);
 }
 std::vector<std::vector<std::vector<std::pair<const char *, float>>>>
    *ResultIterator::GetRawLSTMTimesteps() const {
--- a/src/ccutil/helpers.h
+++ b/src/ccutil/helpers.h
@ -35,6 +35,17 @@

 namespace tesseract {

+// Copy a std::string into a new[]-allocated, NUL-terminated char array (caller must delete[] it).
+// TODO: Remove this function once the related code has been converted
+// to use std::string.
+inline char *copy_string(const std::string &from) {
+  auto length = from.length();
+  char *target_string = new char[length + 1]; // one extra byte for the terminator
+  from.copy(target_string, length); // std::string::copy does not append '\0' itself
+  target_string[length] = '\0';
+  return target_string;
+}
+
 template <class T>
 inline bool contains(const std::vector<T> &data, const T &value) {
  return std::find(data.begin(), data.end(), value) != data.end();