Blame - source/i18n/csrucode.cpp - chromium.googlesource.com/chromium/deps/icu

blob: e0a64aa949a89406eef17ba610a4461436d0374a [file] [log] [blame]

Jungshik Shin	87232d8	2017-05-13 21:10:13 -0700	[diff] [blame]	1	// © 2016 and later: Unicode, Inc. and others.
Jungshik Shin	5feb9ad	2016-10-21 12:52:48 -0700	[diff] [blame]	2	// License & terms of use: http://www.unicode.org/copyright.html
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	3	/*
				4	**********************************************************************
				5	* Copyright (C) 2005-2013, International Business Machines
				6	* Corporation and others. All Rights Reserved.
				7	**********************************************************************
				8	*/
				9
				10	#include "unicode/utypes.h"
				11
				12	#if !UCONFIG_NO_CONVERSION
				13
				14	#include "csrucode.h"
				15	#include "csmatch.h"
				16
				17	U_NAMESPACE_BEGIN
				18
				19	CharsetRecog_Unicode::~CharsetRecog_Unicode()
				20	{
				21	// nothing to do
				22	}
				23
				24	CharsetRecog_UTF_16_BE::~CharsetRecog_UTF_16_BE()
				25	{
				26	// nothing to do
				27	}
				28
				29	const char *CharsetRecog_UTF_16_BE::getName() const
				30	{
				31	return "UTF-16BE";
				32	}
				33
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	34	// UTF-16 confidence calculation. Very simple minded, but better than nothing.
				35	// Any 8 bit non-control characters bump the confidence up. These have a zero high byte,
				36	// and are very likely to be UTF-16, although they could also be part of a UTF-32 code.
				37	// NULs are a contra-indication, they will appear commonly if the actual encoding is UTF-32.
				38	// NULs should be rare in actual text.
				39
				40	static int32_t adjustConfidence(UChar codeUnit, int32_t confidence) {
				41	if (codeUnit == 0) {
				42	confidence -= 10;
				43	} else if ((codeUnit >= 0x20 && codeUnit <= 0xff) \|\| codeUnit == 0x0a) {
				44	confidence += 10;
				45	}
				46	if (confidence < 0) {
				47	confidence = 0;
				48	} else if (confidence > 100) {
				49	confidence = 100;
				50	}
				51	return confidence;
				52	}
				53
				54
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	55	UBool CharsetRecog_UTF_16_BE::match(InputText* textIn, CharsetMatch *results) const
				56	{
				57	const uint8_t *input = textIn->fRawInput;
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	58	int32_t confidence = 10;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	59	int32_t length = textIn->fRawLength;
				60
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	61	int32_t bytesToCheck = (length > 30) ? 30 : length;
				62	for (int32_t charIndex=0; charIndex<bytesToCheck-1; charIndex+=2) {
				63	UChar codeUnit = (input[charIndex] << 8) \| input[charIndex + 1];
				64	if (charIndex == 0 && codeUnit == 0xFEFF) {
				65	confidence = 100;
				66	break;
				67	}
				68	confidence = adjustConfidence(codeUnit, confidence);
				69	if (confidence == 0 \|\| confidence == 100) {
				70	break;
				71	}
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	72	}
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	73	if (bytesToCheck < 4 && confidence < 100) {
				74	confidence = 0;
				75	}
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	76	results->set(textIn, this, confidence);
				77	return (confidence > 0);
				78	}
				79
				80	CharsetRecog_UTF_16_LE::~CharsetRecog_UTF_16_LE()
				81	{
				82	// nothing to do
				83	}
				84
				85	const char *CharsetRecog_UTF_16_LE::getName() const
				86	{
				87	return "UTF-16LE";
				88	}
				89
				90	UBool CharsetRecog_UTF_16_LE::match(InputText* textIn, CharsetMatch *results) const
				91	{
				92	const uint8_t *input = textIn->fRawInput;
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	93	int32_t confidence = 10;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	94	int32_t length = textIn->fRawLength;
				95
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	96	int32_t bytesToCheck = (length > 30) ? 30 : length;
				97	for (int32_t charIndex=0; charIndex<bytesToCheck-1; charIndex+=2) {
				98	UChar codeUnit = input[charIndex] \| (input[charIndex + 1] << 8);
				99	if (charIndex == 0 && codeUnit == 0xFEFF) {
				100	confidence = 100; // UTF-16 BOM
				101	if (length >= 4 && input[2] == 0 && input[3] == 0) {
				102	confidence = 0; // UTF-32 BOM
				103	}
				104	break;
				105	}
				106	confidence = adjustConfidence(codeUnit, confidence);
				107	if (confidence == 0 \|\| confidence == 100) {
				108	break;
				109	}
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	110	}
Jungshik Shin (jungshik at google)	0f8746a	2015-01-08 15:46:45 -0800	[diff] [blame]	111	if (bytesToCheck < 4 && confidence < 100) {
				112	confidence = 0;
				113	}
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	114	results->set(textIn, this, confidence);
				115	return (confidence > 0);
				116	}
				117
				118	CharsetRecog_UTF_32::~CharsetRecog_UTF_32()
				119	{
				120	// nothing to do
				121	}
				122
				123	UBool CharsetRecog_UTF_32::match(InputText* textIn, CharsetMatch *results) const
				124	{
				125	const uint8_t *input = textIn->fRawInput;
				126	int32_t limit = (textIn->fRawLength / 4) * 4;
				127	int32_t numValid = 0;
				128	int32_t numInvalid = 0;
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	129	bool hasBOM = false;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	130	int32_t confidence = 0;
				131
				132	if (limit > 0 && getChar(input, 0) == 0x0000FEFFUL) {
Frank Tang	1f164ee	2022-11-08 12:31:27 -0800	[diff] [blame^]	133	hasBOM = true;
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	134	}
				135
				136	for(int32_t i = 0; i < limit; i += 4) {
				137	int32_t ch = getChar(input, i);
				138
				139	if (ch < 0 \|\| ch >= 0x10FFFF \|\| (ch >= 0xD800 && ch <= 0xDFFF)) {
				140	numInvalid += 1;
				141	} else {
				142	numValid += 1;
				143	}
				144	}
				145
				146
Frank Tang	3e05d9d	2021-11-08 14:04:04 -0800	[diff] [blame]	147	// Cook up some sort of confidence score, based on presence of a BOM
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	148	// and the existence of valid and/or invalid multi-byte sequences.
				149	if (hasBOM && numInvalid==0) {
				150	confidence = 100;
				151	} else if (hasBOM && numValid > numInvalid*10) {
				152	confidence = 80;
				153	} else if (numValid > 3 && numInvalid == 0) {
				154	confidence = 100;
				155	} else if (numValid > 0 && numInvalid == 0) {
				156	confidence = 80;
				157	} else if (numValid > numInvalid*10) {
Frank Tang	7e7574b	2021-04-13 21:19:13 -0700	[diff] [blame]	158	// Probably corrupt UTF-32BE data. Valid sequences aren't likely by chance.
jshin@chromium.org	6f31ac3	2014-03-26 22:15:14 +0000	[diff] [blame]	159	confidence = 25;
				160	}
				161
				162	results->set(textIn, this, confidence);
				163	return (confidence > 0);
				164	}
				165
				166	CharsetRecog_UTF_32_BE::~CharsetRecog_UTF_32_BE()
				167	{
				168	// nothing to do
				169	}
				170
				171	const char *CharsetRecog_UTF_32_BE::getName() const
				172	{
				173	return "UTF-32BE";
				174	}
				175
				176	int32_t CharsetRecog_UTF_32_BE::getChar(const uint8_t *input, int32_t index) const
				177	{
				178	return input[index + 0] << 24 \| input[index + 1] << 16 \|
				179	input[index + 2] << 8 \| input[index + 3];
				180	}
				181
				182	CharsetRecog_UTF_32_LE::~CharsetRecog_UTF_32_LE()
				183	{
				184	// nothing to do
				185	}
				186
				187	const char *CharsetRecog_UTF_32_LE::getName() const
				188	{
				189	return "UTF-32LE";
				190	}
				191
				192	int32_t CharsetRecog_UTF_32_LE::getChar(const uint8_t *input, int32_t index) const
				193	{
				194	return input[index + 3] << 24 \| input[index + 2] << 16 \|
				195	input[index + 1] << 8 \| input[index + 0];
				196	}
				197
				198	U_NAMESPACE_END
				199	#endif
				200