tesseract/unittest/third_party/utf/rune.c

/*
 * The authors of this software are Rob Pike and Ken Thompson.
 *              Copyright (c) 2002 by Lucent Technologies.
 * Permission to use, copy, modify, and distribute this software for any
 * purpose without fee is hereby granted, provided that this entire notice
 * is included in all copies of any software which is or includes a copy
 * or modification of this software and in all copies of the supporting
 * documentation for such software.
 * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
 * WARRANTY.  IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
 * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
 * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
 */
#include <stdarg.h>
#include <string.h>
#include "third_party/utf/utf.h"
#include "third_party/utf/utfdef.h"

/*
 * Bit-level constants shared by the UTF-8 routines in this file.
 */
enum
{
	/* Number of payload bits carried by each kind of byte. */
	Bit1	= 7,	/* sole byte of a 1-byte sequence */
	Bitx	= 6,	/* continuation byte */
	Bit2	= 5,	/* lead byte of a 2-byte sequence */
	Bit3	= 4,	/* lead byte of a 3-byte sequence */
	Bit4	= 3,	/* lead byte of a 4-byte sequence */
	Bit5	= 2,	/* lead byte of a 5-byte sequence (not decoded here) */

	/* Tag patterns: every bit of a byte above the payload and its zero marker. */
	T1	= (0xFF << (Bit1+1)) & 0xFF,	/* 0000 0000 */
	Tx	= (0xFF << (Bitx+1)) & 0xFF,	/* 1000 0000 */
	T2	= (0xFF << (Bit2+1)) & 0xFF,	/* 1100 0000 */
	T3	= (0xFF << (Bit3+1)) & 0xFF,	/* 1110 0000 */
	T4	= (0xFF << (Bit4+1)) & 0xFF,	/* 1111 0000 */
	T5	= (0xFF << (Bit5+1)) & 0xFF,	/* 1111 1000 */

	/* Largest value representable by a sequence of each length. */
	Rune1	= (1 << Bit1) - 1,		/* 0000 0000 0111 1111 */
	Rune2	= (1 << (Bit2 + Bitx)) - 1,	/* 0000 0111 1111 1111 */
	Rune3	= (1 << (Bit3 + 2*Bitx)) - 1,	/* 1111 1111 1111 1111 */
	Rune4	= (1 << (Bit4 + 3*Bitx)) - 1,	/* 0001 1111 1111 1111 1111 1111 */

	Maskx	= (1 << Bitx) - 1,		/* 0011 1111: payload of a continuation byte */
	Testx	= (0xFF << Bitx) & 0xFF,	/* 1100 0000: tag bits of a continuation byte */

	/* Value stored in *rune whenever decoding fails. */
	Bad	= Runeerror
};

/*
 * charntorune: decode one UTF-8 sequence from a buffer that is not
 * necessarily null-terminated.
 *
 * Modified by Wei-Hwa Huang, Google Inc., on 2004-09-24, as a slower
 * but "safe" counterpart of chartorune.  If the string is known to be
 * null-terminated, chartorune will be a bit faster.
 *
 * Only the bytes str[0] .. str[length-1] are ever read.
 *
 * Return value:
 *   - the number of bytes consumed (1 to 4), with the decoded value
 *     stored in *rune, when a sequence was decoded;
 *   - 1, with *rune set to Bad, when the bytes examined were rejected
 *     (bad continuation byte, overlong form, value above Runemax, or
 *     a lead byte announcing five or more bytes);
 *   - 0, with *rune set to Bad, when length <= 0 or when the buffer
 *     ends before the sequence could be completed.
 */
int
charntorune(Rune *rune, const char *str, int length)
{
	const unsigned char *p = (const unsigned char *)str;
	int lead, t1, t2, t3;
	long v;

	/* Not allowed to read anything at all. */
	if (length <= 0) {
		*rune = Bad;
		return 0;
	}

	/*
	 * One byte (7-bit value):
	 *	0000-007F => T1
	 */
	lead = p[0];
	if (lead < Tx) {
		*rune = lead;
		return 1;
	}

	/* Everything else needs at least a second byte. */
	if (length <= 1) {
		*rune = Bad;
		return 0;
	}

	/*
	 * XOR with Tx strips the tag from a genuine continuation byte,
	 * leaving only payload; any bit left under Testx means the byte
	 * was not of the form 10xx xxxx.
	 */
	t1 = p[1] ^ Tx;
	if ((t1 & Testx) != 0) {
		*rune = Bad;
		return 1;
	}

	/*
	 * Two bytes (11-bit value):
	 *	0080-07FF => T2 Tx
	 */
	if (lead < T3) {
		/* A continuation byte cannot start a sequence. */
		if (lead < T2) {
			*rune = Bad;
			return 1;
		}
		v = ((lead << Bitx) | t1) & Rune2;
		/* Reject overlong forms that fit in one byte. */
		if (v <= Rune1) {
			*rune = Bad;
			return 1;
		}
		*rune = v;
		return 2;
	}

	/* A third byte is required from here on. */
	if (length <= 2) {
		*rune = Bad;
		return 0;
	}

	t2 = p[2] ^ Tx;
	if ((t2 & Testx) != 0) {
		*rune = Bad;
		return 1;
	}

	/*
	 * Three bytes (16-bit value):
	 *	0800-FFFF => T3 Tx Tx
	 */
	if (lead < T4) {
		v = ((((lead << Bitx) | t1) << Bitx) | t2) & Rune3;
		/* Reject overlong forms that fit in two bytes. */
		if (v <= Rune2) {
			*rune = Bad;
			return 1;
		}
		*rune = v;
		return 3;
	}

	/* A fourth byte is required from here on. */
	if (length <= 3) {
		*rune = Bad;
		return 0;
	}

	t3 = p[3] ^ Tx;
	if ((t3 & Testx) != 0) {
		*rune = Bad;
		return 1;
	}

	/*
	 * Four bytes (21-bit value):
	 *	10000-1FFFFF => T4 Tx Tx Tx
	 */
	if (lead < T5) {
		v = ((((((lead << Bitx) | t1) << Bitx) | t2) << Bitx) | t3) & Rune4;
		/* Reject overlong forms and values beyond Runemax. */
		if (v <= Rune3 || v > Runemax) {
			*rune = Bad;
			return 1;
		}
		*rune = v;
		return 4;
	}

	/*
	 * Sequences of five or more bytes are not supported, so such a
	 * lead byte is reported as a decoding error.
	 */
	*rune = Bad;
	return 1;
}


/*
 * chartorune: decode one UTF-8 sequence from a null-terminated string.
 *
 * This is the older "unsafe" version: it takes no length, so str must
 * be null-terminated.  A null byte is never accepted as a continuation
 * byte, so decoding stops with an error before running past it.
 *
 * Returns the number of bytes consumed (1 to 4) with the decoded value
 * in *rune.  On any decoding problem *rune is set to Bad and 1 is
 * returned.
 */
int
chartorune(Rune *rune, const char *str)
{
	const unsigned char *p = (const unsigned char *)str;
	int lead, t1, t2, t3;
	long v;

	/*
	 * One byte:
	 *	0000-007F => T1
	 */
	lead = p[0];
	if (lead < Tx) {
		*rune = lead;
		return 1;
	}

	/*
	 * XOR with Tx strips the tag from a genuine continuation byte;
	 * any bit left under Testx means the byte was not 10xx xxxx.
	 */
	t1 = p[1] ^ Tx;
	if ((t1 & Testx) != 0) {
		*rune = Bad;
		return 1;
	}

	/*
	 * Two bytes:
	 *	0080-07FF => T2 Tx
	 */
	if (lead < T3) {
		/* A continuation byte cannot start a sequence. */
		if (lead < T2) {
			*rune = Bad;
			return 1;
		}
		v = ((lead << Bitx) | t1) & Rune2;
		/* Reject overlong forms that fit in one byte. */
		if (v <= Rune1) {
			*rune = Bad;
			return 1;
		}
		*rune = v;
		return 2;
	}

	t2 = p[2] ^ Tx;
	if ((t2 & Testx) != 0) {
		*rune = Bad;
		return 1;
	}

	/*
	 * Three bytes:
	 *	0800-FFFF => T3 Tx Tx
	 */
	if (lead < T4) {
		v = ((((lead << Bitx) | t1) << Bitx) | t2) & Rune3;
		/* Reject overlong forms that fit in two bytes. */
		if (v <= Rune2) {
			*rune = Bad;
			return 1;
		}
		*rune = v;
		return 3;
	}

	t3 = p[3] ^ Tx;
	if ((t3 & Testx) != 0) {
		*rune = Bad;
		return 1;
	}

	/*
	 * Four bytes (21-bit value):
	 *	10000-1FFFFF => T4 Tx Tx Tx
	 */
	if (lead < T5) {
		v = ((((((lead << Bitx) | t1) << Bitx) | t2) << Bitx) | t3) & Rune4;
		/* Reject overlong forms and values beyond Runemax. */
		if (v <= Rune3 || v > Runemax) {
			*rune = Bad;
			return 1;
		}
		*rune = v;
		return 4;
	}

	/*
	 * Sequences of five or more bytes are not supported, so such a
	 * lead byte is reported as a decoding error.
	 */
	*rune = Bad;
	return 1;
}

/*
 * isvalidcharntorune: decode one rune with charntorune and report
 * whether the input was valid.
 *
 * *rune receives the decoded value and *consumed the byte count that
 * charntorune returned.  The result is 1 unless *rune is Runeerror
 * and the byte count is not 3; a Runeerror that took three bytes is
 * treated as a genuine encoded error rune rather than a failure.
 */
int
isvalidcharntorune(const char* str, int length, Rune* rune, int* consumed) {
	int nbytes;

	nbytes = charntorune(rune, str, length);
	*consumed = nbytes;

	if (*rune == Runeerror && nbytes != 3)
		return 0;
	return 1;
}
    
int
runetochar(char *str, const Rune *rune)
{
	/* Runes are signed, so convert to unsigned for range check. */
	unsigned long c;

	/*
	 * one character sequence
	 *	00000-0007F => 00-7F
	 */
	c = *rune;
	if(c <= Rune1) {
		str[0] = c;
		return 1;
	}

	/*
	 * two character sequence
	 *	0080-07FF => T2 Tx
	 */
	if(c <= Rune2) {
		str[0] = T2 | (c >> 1*Bitx);
		str[1] = Tx | (c & Maskx);
		return 2;
	}

	/*
	 * If the Rune is out of range, convert it to the error rune.
	 * Do this test here because the error rune encodes to three bytes.
	 * Doing it earlier would duplicate work, since an out of range
	 * Rune wouldn't have fit in one or two bytes.
	 */
	if (c > Runemax)
		c = Runeerror;

	/*
	 * three character sequence
	 *	0800-FFFF => T3 Tx Tx
	 */
	if (c <= Rune3) {
		str[0] = T3 |  (c >> 2*Bitx);
		str[1] = Tx | ((c >> 1*Bitx) & Maskx);
		str[2] = Tx |  (c & Maskx);
		return 3;
	}

	/*
	 * four character sequence (21-bit value)
	 *     10000-1FFFFF => T4 Tx Tx Tx
	 */
	str[0] = T4 | (c >> 3*Bitx);
	str[1] = Tx | ((c >> 2*Bitx) & Maskx);
	str[2] = Tx | ((c >> 1*Bitx) & Maskx);
	str[3] = Tx | (c & Maskx);
	return 4;
}

/*
 * runelen: number of bytes runetochar produces for a single rune.
 * The rune is encoded into a scratch buffer that is then discarded.
 */
int
runelen(Rune rune)
{
	char scratch[10];
	int nbytes;

	nbytes = runetochar(scratch, &rune);
	return nbytes;
}

/*
 * runenlen: total number of bytes needed to encode the first nrune
 * runes of the array r as UTF-8.
 *
 * Runes outside 0..Runemax are counted as three bytes, matching
 * runetochar, which encodes them as Runeerror.
 *
 * A zero or negative nrune yields 0 and r is not read.  (A plain
 * "while(nrune--)" would keep walking past the array when handed a
 * negative count.)
 */
int
runenlen(const Rune *r, int nrune)
{
	int nb = 0;
	int i;

	for (i = 0; i < nrune; i++) {
		/* Rune is signed, so use unsigned for the range checks. */
		unsigned long c = r[i];

		if (c <= Rune1)
			nb += 1;
		else if (c <= Rune2)
			nb += 2;
		else if (c <= Rune3)
			nb += 3;
		else if (c <= Runemax)
			nb += 4;
		else
			nb += 3;	/* Runeerror = 0xFFFD, see runetochar */
	}
	return nb;
}

/*
 * fullrune: report whether the n bytes at str are enough to hold the
 * sequence announced by the first byte.
 *
 * Only str[0] is inspected, and only when n > 0; the following bytes
 * are not validated.  Returns 1 when enough bytes are present, 0
 * otherwise.
 */
int
fullrune(const char *str, int n)
{
	int lead, need;

	if (n <= 0)
		return 0;

	lead = *(const unsigned char *)str;

	/* Bytes the first byte calls for; anything >= T4 is treated as four. */
	if (lead < Tx)
		need = 1;
	else if (lead < T3)
		need = 2;
	else if (lead < T4)
		need = 3;
	else
		need = 4;

	return n >= need;
}
Add code from tensorflow/models The new code was copied from the latest code on GitHub (https://github.com/tensorflow/models/tree/master/research/syntaxnet). It is required for pango_font_info_test and other unit tests. Signed-off-by: Stefan Weil <sw@weilnetz.de> 2019-06-24 18:33:26 +08:00			`/*`
			`* The authors of this software are Rob Pike and Ken Thompson.`
			`* Copyright (c) 2002 by Lucent Technologies.`
			`* Permission to use, copy, modify, and distribute this software for any`
			`* purpose without fee is hereby granted, provided that this entire notice`
			`* is included in all copies of any software which is or includes a copy`
			`* or modification of this software and in all copies of the supporting`
			`* documentation for such software.`
			`* THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED`
			`* WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY`
			`* REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY`
			`* OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.`
			`*/`
			`#include <stdarg.h>`
			`#include <string.h>`
			`#include "third_party/utf/utf.h"`
			`#include "third_party/utf/utfdef.h"`

			`enum`
			`{`
			`Bit1 = 7,`
			`Bitx = 6,`
			`Bit2 = 5,`
			`Bit3 = 4,`
			`Bit4 = 3,`
			`Bit5 = 2,`

			`T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */`
			`Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */`
			`T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */`
			`T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */`
			`T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */`
			`T5 = ((1<<(Bit5+1))-1) ^ 0xFF, /* 1111 1000 */`

			`Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0111 1111 */`
			`Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0111 1111 1111 */`
			`Rune3 = (1<<(Bit3+2*Bitx))-1, /* 1111 1111 1111 1111 */`
			`Rune4 = (1<<(Bit4+3*Bitx))-1,`
			`/* 0001 1111 1111 1111 1111 1111 */`

			`Maskx = (1<<Bitx)-1, /* 0011 1111 */`
			`Testx = Maskx ^ 0xFF, /* 1100 0000 */`

			`Bad = Runeerror,`
			`};`

			`/*`
			`* Modified by Wei-Hwa Huang, Google Inc., on 2004-09-24`
			`* This is a slower but "safe" version of the old chartorune`
			`* that works on strings that are not necessarily null-terminated.`
			`*`
			`* If you know for sure that your string is null-terminated,`
			`* chartorune will be a bit faster.`
			`*`
			`* It is guaranteed not to attempt to access "length"`
			`* past the incoming pointer. This is to avoid`
			`* possible access violations. If the string appears to be`
			`* well-formed but incomplete (i.e., to get the whole Rune`
			`* we'd need to read past str+length) then we'll set the Rune`
			`* to Bad and return 0.`
			`*`
			`* Note that if we have decoding problems for other`
			`* reasons, we return 1 instead of 0.`
			`*/`
			`int`
			`charntorune(Rune *rune, const char *str, int length)`
			`{`
			`int c, c1, c2, c3;`
			`long l;`

			`/* When we're not allowed to read anything */`
			`if(length <= 0) {`
			`goto badlen;`
			`}`

			`/*`
			`* one character sequence (7-bit value)`
			`* 00000-0007F => T1`
			`*/`
			`c = *(uchar*)str;`
			`if(c < Tx) {`
			`*rune = c;`
			`return 1;`
			`}`

			`// If we can't read more than one character we must stop`
			`if(length <= 1) {`
			`goto badlen;`
			`}`

			`/*`
			`* two character sequence (11-bit value)`
			`* 0080-07FF => T2 Tx`
			`*/`
			`c1 = *(uchar*)(str+1) ^ Tx;`
			`if(c1 & Testx)`
			`goto bad;`
			`if(c < T3) {`
			`if(c < T2)`
			`goto bad;`
			`l = ((c << Bitx) \| c1) & Rune2;`
			`if(l <= Rune1)`
			`goto bad;`
			`*rune = l;`
			`return 2;`
			`}`

			`// If we can't read more than two characters we must stop`
			`if(length <= 2) {`
			`goto badlen;`
			`}`

			`/*`
			`* three character sequence (16-bit value)`
			`* 0800-FFFF => T3 Tx Tx`
			`*/`
			`c2 = *(uchar*)(str+2) ^ Tx;`
			`if(c2 & Testx)`
			`goto bad;`
			`if(c < T4) {`
			`l = ((((c << Bitx) \| c1) << Bitx) \| c2) & Rune3;`
			`if(l <= Rune2)`
			`goto bad;`
			`*rune = l;`
			`return 3;`
			`}`

			`if (length <= 3)`
			`goto badlen;`

			`/*`
			`* four character sequence (21-bit value)`
			`* 10000-1FFFFF => T4 Tx Tx Tx`
			`*/`
			`c3 = *(uchar*)(str+3) ^ Tx;`
			`if (c3 & Testx)`
			`goto bad;`
			`if (c < T5) {`
			`l = ((((((c << Bitx) \| c1) << Bitx) \| c2) << Bitx) \| c3) & Rune4;`
			`if (l <= Rune3)`
			`goto bad;`
			`if (l > Runemax)`
			`goto bad;`
			`*rune = l;`
			`return 4;`
			`}`

			`// Support for 5-byte or longer UTF-8 would go here, but`
			`// since we don't have that, we'll just fall through to bad.`

			`/*`
			`* bad decoding`
			`*/`
			`bad:`
			`*rune = Bad;`
			`return 1;`
			`badlen:`
			`*rune = Bad;`
			`return 0;`

			`}`


			`/*`
			`* This is the older "unsafe" version, which works fine on`
			`* null-terminated strings.`
			`*/`
			`int`
			`chartorune(Rune *rune, const char *str)`
			`{`
			`int c, c1, c2, c3;`
			`long l;`

			`/*`
			`* one character sequence`
			`* 00000-0007F => T1`
			`*/`
			`c = *(uchar*)str;`
			`if(c < Tx) {`
			`*rune = c;`
			`return 1;`
			`}`

			`/*`
			`* two character sequence`
			`* 0080-07FF => T2 Tx`
			`*/`
			`c1 = *(uchar*)(str+1) ^ Tx;`
			`if(c1 & Testx)`
			`goto bad;`
			`if(c < T3) {`
			`if(c < T2)`
			`goto bad;`
			`l = ((c << Bitx) \| c1) & Rune2;`
			`if(l <= Rune1)`
			`goto bad;`
			`*rune = l;`
			`return 2;`
			`}`

			`/*`
			`* three character sequence`
			`* 0800-FFFF => T3 Tx Tx`
			`*/`
			`c2 = *(uchar*)(str+2) ^ Tx;`
			`if(c2 & Testx)`
			`goto bad;`
			`if(c < T4) {`
			`l = ((((c << Bitx) \| c1) << Bitx) \| c2) & Rune3;`
			`if(l <= Rune2)`
			`goto bad;`
			`*rune = l;`
			`return 3;`
			`}`

			`/*`
			`* four character sequence (21-bit value)`
			`* 10000-1FFFFF => T4 Tx Tx Tx`
			`*/`
			`c3 = *(uchar*)(str+3) ^ Tx;`
			`if (c3 & Testx)`
			`goto bad;`
			`if (c < T5) {`
			`l = ((((((c << Bitx) \| c1) << Bitx) \| c2) << Bitx) \| c3) & Rune4;`
			`if (l <= Rune3)`
			`goto bad;`
			`if (l > Runemax)`
			`goto bad;`
			`*rune = l;`
			`return 4;`
			`}`

			`/*`
			`* Support for 5-byte or longer UTF-8 would go here, but`
			`* since we don't have that, we'll just fall through to bad.`
			`*/`

			`/*`
			`* bad decoding`
			`*/`
			`bad:`
			`*rune = Bad;`
			`return 1;`
			`}`

			`int`
			`isvalidcharntorune(const char* str, int length, Rune* rune, int* consumed) {`
			`*consumed = charntorune(rune, str, length);`
			`return *rune != Runeerror \|\| *consumed == 3;`
			`}`

			`int`
			`runetochar(char *str, const Rune *rune)`
			`{`
			`/* Runes are signed, so convert to unsigned for range check. */`
			`unsigned long c;`

			`/*`
			`* one character sequence`
			`* 00000-0007F => 00-7F`
			`*/`
			`c = *rune;`
			`if(c <= Rune1) {`
			`str[0] = c;`
			`return 1;`
			`}`

			`/*`
			`* two character sequence`
			`* 0080-07FF => T2 Tx`
			`*/`
			`if(c <= Rune2) {`
			`str[0] = T2 \| (c >> 1*Bitx);`
			`str[1] = Tx \| (c & Maskx);`
			`return 2;`
			`}`

			`/*`
			`* If the Rune is out of range, convert it to the error rune.`
			`* Do this test here because the error rune encodes to three bytes.`
			`* Doing it earlier would duplicate work, since an out of range`
			`* Rune wouldn't have fit in one or two bytes.`
			`*/`
			`if (c > Runemax)`
			`c = Runeerror;`

			`/*`
			`* three character sequence`
			`* 0800-FFFF => T3 Tx Tx`
			`*/`
			`if (c <= Rune3) {`
			`str[0] = T3 \| (c >> 2*Bitx);`
			`str[1] = Tx \| ((c >> 1*Bitx) & Maskx);`
			`str[2] = Tx \| (c & Maskx);`
			`return 3;`
			`}`

			`/*`
			`* four character sequence (21-bit value)`
			`* 10000-1FFFFF => T4 Tx Tx Tx`
			`*/`
			`str[0] = T4 \| (c >> 3*Bitx);`
			`str[1] = Tx \| ((c >> 2*Bitx) & Maskx);`
			`str[2] = Tx \| ((c >> 1*Bitx) & Maskx);`
			`str[3] = Tx \| (c & Maskx);`
			`return 4;`
			`}`

			`int`
			`runelen(Rune rune)`
			`{`
			`char str[10];`

			`return runetochar(str, &rune);`
			`}`

			`int`
			`runenlen(const Rune *r, int nrune)`
			`{`
			`int nb;`
			`ulong c; /* Rune is signed, so use unsigned for range check. */`

			`nb = 0;`
			`while(nrune--) {`
			`c = *r++;`
			`if (c <= Rune1)`
			`nb++;`
			`else if (c <= Rune2)`
			`nb += 2;`
			`else if (c <= Rune3)`
			`nb += 3;`
			`else if (c <= Runemax)`
			`nb += 4;`
			`else`
			`nb += 3; /* Runeerror = 0xFFFD, see runetochar */`
			`}`
			`return nb;`
			`}`

			`int`
			`fullrune(const char *str, int n)`
			`{`
			`if (n > 0) {`
			`int c = *(uchar*)str;`
			`if (c < Tx)`
			`return 1;`
			`if (n > 1) {`
			`if (c < T3)`
			`return 1;`
			`if (n > 2) {`
			`if (c < T4 \|\| n > 3)`
			`return 1;`
			`}`
			`}`
			`}`
			`return 0;`
			`}`