/**********************************************************************
 * File:        degradeimage.cpp
 * Description: Function to degrade an image (usually of text) as if it
 *              has been printed and then scanned.
 * Authors:     Ray Smith
 * Created:     Tue Nov 19 2013
 *
 * (C) Copyright 2013, Google Inc.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * http://www.apache.org/licenses/LICENSE-2.0
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 **********************************************************************/

#include "degradeimage.h"

#include <cmath>    // For fabs.
#include <cstdlib>  // For srand and free.
#include "allheaders.h"  // from leptonica
#include "genericvector.h"
#include "helpers.h"  // For TRand.
#include "rect.h"

namespace tesseract {

// A randomized perspective distortion can be applied to synthetic input.
// The perspective distortion comes from leptonica, which uses 2 sets of 4
// corners to determine the distortion. There are random values for each of
// the x numbers x0..x3 and y0..y3, except for x2 and x3 which are instead
// defined in terms of a single shear value. This reduces the degrees of
// freedom enough to make the distortion more realistic than it would otherwise
// be if all 8 coordinates could move independently.
// One additional factor is used for the color of the pixels that don't exist
// in the source image.

// Name for each of the randomizing factors. The values double as indices into
// the factors array built in ProjectiveCoeffs, so they must stay contiguous
// and FN_NUM_FACTORS must stay last.
enum FactorNames {
  FN_INCOLOR,  // Color brought in for pixels outside the source image.
  FN_Y0,
  FN_Y1,
  FN_Y2,
  FN_Y3,
  FN_X0,
  FN_X1,
  FN_SHEAR,
  // x2 = x1 - shear
  // x3 = x0 + shear
  FN_NUM_FACTORS
};

// Rotation is +/- kRotationRange radians.
const float kRotationRange = 0.02f;
// Number of grey levels to shift by for each exposure step.
const int kExposureFactor = 16;
// Salt and pepper noise is +/- kSaltnPepper.
const int kSaltnPepper = 5;
// Min sum of width + height on which to operate the ramp.
const int kMinRampSize = 1000;

// Degrade the pix as if by a print/copy/scan cycle with exposure > 0
// corresponding to darkening on the copier and <0 lighter and 0 not copied.
// Exposures in [-2,2] are most useful, with -3 and 3 being extreme.
// If rotation is nullptr, rotation is skipped. If *rotation is non-zero, the
// pix is rotated by *rotation else it is randomly rotated and *rotation is
// modified.
//
// HOW IT WORKS:
// Most of the process is really dictated by the fact that the minimum
// available convolution is 3X3, which is too big really to simulate a
// good quality print/scan process. (2X2 would be better.)
// 1 pixel wide inputs are heavily smeared by the 3X3 convolution, making the
// images generally biased to being too light, so most of the work is to make
// them darker. 3 levels of thickening/darkening are achieved with 2 dilations,
// (using a greyscale erosion) one heavy (by being before convolution) and one
// light (after convolution).
// With no dilation, after convolution, the images are so light that a heavy
// constant offset is required to make the 0 image look reasonable. A simple
// constant offset multiple of exposure to undo this value is enough to achieve
// all the required lightening. This gives the advantage that exposure level 1
// with a single dilation gives a good impression of the broken-yet-too-dark
// problem that is often seen in scans.
// A small random rotation gives some varying greyscale values on the edges,
// and some random salt and pepper noise on top helps to realistically jaggy-up
// the edges.
// Finally a greyscale ramp provides a continuum of effects between exposure
// levels.
Pix* DegradeImage(Pix* input, int exposure, TRand* randomizer, float* rotation) { Pix* pix = pixConvertTo8(input, false); pixDestroy(&input); input = pix; int width = pixGetWidth(input); int height = pixGetHeight(input); if (exposure >= 2) { // An erosion simulates the spreading darkening of a dark copy. // This is backwards to binary morphology, // see http://www.leptonica.com/grayscale-morphology.html pix = input; input = pixErodeGray(pix, 3, 3); pixDestroy(&pix); } // A convolution is essential to any mode as no scanner produces an // image as sharp as the electronic image. pix = pixBlockconv(input, 1, 1); pixDestroy(&input); // A small random rotation helps to make the edges jaggy in a realistic way. if (rotation != nullptr) { float radians_clockwise = 0.0f; if (*rotation) { radians_clockwise = *rotation; } else if (randomizer != nullptr) { radians_clockwise = randomizer->SignedRand(kRotationRange); } input = pixRotate(pix, radians_clockwise, L_ROTATE_AREA_MAP, L_BRING_IN_WHITE, 0, 0); // Rotate the boxes to match. *rotation = radians_clockwise; pixDestroy(&pix); } else { input = pix; } if (exposure >= 3 || exposure == 1) { // Erosion after the convolution is not as heavy as before, so it is // good for level 1 and in addition as a level 3. // This is backwards to binary morphology, // see http://www.leptonica.com/grayscale-morphology.html pix = input; input = pixErodeGray(pix, 3, 3); pixDestroy(&pix); } // The convolution really needed to be 2x2 to be realistic enough, but // we only have 3x3, so we have to bias the image darker or lose thin // strokes. int erosion_offset = 0; // For light and 0 exposure, there is no dilation, so compensate for the // convolution with a big darkening bias which is undone for lighter // exposures. if (exposure <= 0) erosion_offset = -3 * kExposureFactor; // Add in a general offset of the greyscales for the exposure level so // a threshold of 128 gives a reasonable binary result. 
erosion_offset -= exposure * kExposureFactor; // Add a gradual fade over the page and a small amount of salt and pepper // noise to simulate noise in the sensor/paper fibres and varying // illumination. l_uint32* data = pixGetData(input); for (int y = 0; y < height; ++y) { for (int x = 0; x < width; ++x) { int pixel = GET_DATA_BYTE(data, x); if (randomizer != nullptr) pixel += randomizer->IntRand() % (kSaltnPepper*2 + 1) - kSaltnPepper; if (height + width > kMinRampSize) pixel -= (2*x + y) * 32 / (height + width); pixel += erosion_offset; if (pixel < 0) pixel = 0; if (pixel > 255) pixel = 255; SET_DATA_BYTE(data, x, pixel); } data += input->wpl; } return input; } // Creates and returns a Pix distorted by various means according to the bool // flags. If boxes is not nullptr, the boxes are resized/positioned according to // any spatial distortion and also by the integer reduction factor box_scale // so they will match what the network will output. // Returns nullptr on error. The returned Pix must be pixDestroyed. Pix* PrepareDistortedPix(const Pix* pix, bool perspective, bool invert, bool white_noise, bool smooth_noise, bool blur, int box_reduction, TRand* randomizer, GenericVector* boxes) { Pix* distorted = pixCopy(nullptr, const_cast(pix)); // Things to do to synthetic training data. if (invert && randomizer->SignedRand(1.0) < 0) pixInvert(distorted, distorted); if ((white_noise || smooth_noise) && randomizer->SignedRand(1.0) > 0.0) { // TODO(rays) Cook noise in a more thread-safe manner than rand(). // Attempt to make the sequences reproducible. 
srand(randomizer->IntRand()); Pix* pixn = pixAddGaussianNoise(distorted, 8.0); pixDestroy(&distorted); if (smooth_noise) { distorted = pixBlockconv(pixn, 1, 1); pixDestroy(&pixn); } else { distorted = pixn; } } if (blur && randomizer->SignedRand(1.0) > 0.0) { Pix* blurred = pixBlockconv(distorted, 1, 1); pixDestroy(&distorted); distorted = blurred; } if (perspective) GeneratePerspectiveDistortion(0, 0, randomizer, &distorted, boxes); if (boxes != nullptr) { for (int b = 0; b < boxes->size(); ++b) { (*boxes)[b].scale(1.0f / box_reduction); if ((*boxes)[b].width() <= 0) (*boxes)[b].set_right((*boxes)[b].left() + 1); } } return distorted; } // Distorts anything that has a non-null pointer with the same pseudo-random // perspective distortion. Width and height only need to be set if there // is no pix. If there is a pix, then they will be taken from there. void GeneratePerspectiveDistortion(int width, int height, TRand* randomizer, Pix** pix, GenericVector* boxes) { if (pix != nullptr && *pix != nullptr) { width = pixGetWidth(*pix); height = pixGetHeight(*pix); } float* im_coeffs = nullptr; float* box_coeffs = nullptr; l_int32 incolor = ProjectiveCoeffs(width, height, randomizer, &im_coeffs, &box_coeffs); if (pix != nullptr && *pix != nullptr) { // Transform the image. Pix* transformed = pixProjective(*pix, im_coeffs, incolor); if (transformed == nullptr) { tprintf("Projective transformation failed!!\n"); return; } pixDestroy(pix); *pix = transformed; } if (boxes != nullptr) { // Transform the boxes. 
for (int b = 0; b < boxes->size(); ++b) { int x1, y1, x2, y2; const TBOX& box = (*boxes)[b]; projectiveXformSampledPt(box_coeffs, box.left(), height - box.top(), &x1, &y1); projectiveXformSampledPt(box_coeffs, box.right(), height - box.bottom(), &x2, &y2); TBOX new_box1(x1, height - y2, x2, height - y1); projectiveXformSampledPt(box_coeffs, box.left(), height - box.bottom(), &x1, &y1); projectiveXformSampledPt(box_coeffs, box.right(), height - box.top(), &x2, &y2); TBOX new_box2(x1, height - y1, x2, height - y2); (*boxes)[b] = new_box1.bounding_union(new_box2); } } free(im_coeffs); free(box_coeffs); } // Computes the coefficients of a randomized projective transformation. // The image transform requires backward transformation coefficient, and the // box transform the forward coefficients. // Returns the incolor arg to pixProjective. int ProjectiveCoeffs(int width, int height, TRand* randomizer, float** im_coeffs, float** box_coeffs) { // Setup "from" points. Pta* src_pts = ptaCreate(4); ptaAddPt(src_pts, 0.0f, 0.0f); ptaAddPt(src_pts, width, 0.0f); ptaAddPt(src_pts, width, height); ptaAddPt(src_pts, 0.0f, height); // Extract factors from pseudo-random sequence. float factors[FN_NUM_FACTORS]; float shear = 0.0f; // Shear is signed. for (int i = 0; i < FN_NUM_FACTORS; ++i) { // Everything is squared to make wild values rarer. if (i == FN_SHEAR) { // Shear is signed. shear = randomizer->SignedRand(0.5 / 3.0); shear = shear >= 0.0 ? shear * shear : -shear * shear; // Keep the sheared points within the original rectangle. if (shear < -factors[FN_X0]) shear = -factors[FN_X0]; if (shear > factors[FN_X1]) shear = factors[FN_X1]; factors[i] = shear; } else if (i != FN_INCOLOR) { factors[i] = fabs(randomizer->SignedRand(1.0)); if (i <= FN_Y3) factors[i] *= 5.0 / 8.0; else factors[i] *= 0.5; factors[i] *= factors[i]; } } // Setup "to" points. 
Pta* dest_pts = ptaCreate(4); ptaAddPt(dest_pts, factors[FN_X0] * width, factors[FN_Y0] * height); ptaAddPt(dest_pts, (1.0f - factors[FN_X1]) * width, factors[FN_Y1] * height); ptaAddPt(dest_pts, (1.0f - factors[FN_X1] + shear) * width, (1 - factors[FN_Y2]) * height); ptaAddPt(dest_pts, (factors[FN_X0] + shear) * width, (1 - factors[FN_Y3]) * height); getProjectiveXformCoeffs(dest_pts, src_pts, im_coeffs); getProjectiveXformCoeffs(src_pts, dest_pts, box_coeffs); ptaDestroy(&src_pts); ptaDestroy(&dest_pts); return factors[FN_INCOLOR] > 0.5f ? L_BRING_IN_WHITE : L_BRING_IN_BLACK; } } // namespace tesseract