/*
 * Copyright 2015-2025 MicroEJ Corp.
 * Use of this source code is governed by a BSD-style license that can be found with this software.
 */
package ej.websocket.util;

import ej.bon.Constants;

/**
 * Utility class to check that a range of bytes is a well-formed UTF-8 sequence.
 * <p>
 * The check is only performed when the boolean constant named by {@link #CHECK_UTF8_VALIDITY} is enabled; otherwise
 * every input is reported as valid.
 */
public class UTF8Validator {

	/**
	 * Name of the boolean constant, read with {@link Constants#getBoolean(String)}, that enables the validation.
	 */
	public static final String CHECK_UTF8_VALIDITY = "ej.websocket.utf8.validation.enabled";

	private UTF8Validator() {
		// Forbid instantiation.
	}

	/**
	 * Checks that a range of a bytes array contains only complete, well-formed UTF-8 sequences.
	 * <p>
	 * Overlong encodings, UTF-16 surrogates (U+D800 to U+DFFF), code points above U+10FFFF and sequences truncated at
	 * the end of the range are all rejected.
	 *
	 * @param bytes
	 *            the bytes array to check.
	 * @param offset
	 *            the index of the first byte to check.
	 * @param length
	 *            the number of bytes to check.
	 * @return {@code true} if the validation is disabled or if the range contains only valid UTF-8; {@code false} if
	 *         the range contains invalid UTF-8, if {@code offset} or {@code length} is negative, or if the range does
	 *         not fit in the array.
	 * @throws NullPointerException
	 *             if the validation is enabled and {@code bytes} is {@code null}.
	 */
	public static boolean isValid(final byte[] bytes, final int offset, final int length) {
		if (!Constants.getBoolean(CHECK_UTF8_VALIDITY)) {
			return true;
		}
		if (isOutOfBounds(bytes, offset, length)) {
			return false;
		}
		// Safe from overflow: the range has been checked against the array length.
		final int end = offset + length;
		// Number of bytes still expected for the character being decoded (0 when a new character starts).
		int remaining = 0;
		// Lead byte of the character being decoded; it constrains the byte that follows it.
		int firstByte = 0;

		for (int i = offset; i < end; i++) {
			final int b = bytes[i] & 0xFF;
			if (remaining == 0) {
				remaining = computeSequenceLength(b);
				// Reject invalid lead bytes and sequences that would run past the end of the range.
				if (remaining == 0 || remaining > end - i) {
					return false;
				}
				firstByte = b;
			} else if (isInvalidByte(b, remaining, firstByte)) {
				return false;
			}
			// One more byte of the current character has been consumed.
			remaining--;
		}

		// A non-zero value means that the last character is incomplete.
		return remaining == 0;
	}

	/**
	 * Gets the total number of bytes of the UTF-8 sequence introduced by the given lead byte.
	 *
	 * @param b
	 *            the lead byte, as an unsigned value (0 to 255).
	 * @return the sequence length (1 to 4), or 0 if the byte cannot start a sequence.
	 */
	private static int computeSequenceLength(int b) {
		if ((b & 0b11110000) == 0b11110000) {
			// 1111xxxx: lead bytes above 0xF4 would encode code points above U+10FFFF
			// (0xF8 to 0xFF are not valid UTF-8 lead bytes at all).
			return (b > 0b11110100) ? 0 : 4;
		} else if ((b & 0b11100000) == 0b11100000) {
			// 1110xxxx
			return 3;
		} else if ((b & 0b11000000) == 0b11000000) {
			// 110xxxxx: 0xC0 and 0xC1 would be overlong encodings of U+0000 to U+007F.
			return (b < 0b11000010) ? 0 : 2;
		} else if ((b & 0b10000000) == 0b00000000) {
			// 0xxxxxxx
			return 1;
		} else {
			// 10xxxxxx: a continuation byte cannot start a sequence.
			return 0;
		}
	}

	/**
	 * Checks whether a byte is invalid as a non-leading byte of a multi-byte sequence.
	 *
	 * @param b
	 *            the byte to check, as an unsigned value (0 to 255).
	 * @param numberOfBytesToProcess
	 *            the number of bytes remaining in the sequence, including {@code b}.
	 * @param firstByte
	 *            the lead byte of the sequence.
	 * @return {@code true} if the byte is not allowed at this position.
	 */
	private static boolean isInvalidByte(int b, int numberOfBytesToProcess, int firstByte) {
		// Every non-leading byte must match the pattern 10xxxxxx.
		if ((b & 0b11000000) != 0b10000000) {
			return true;
		}
		// For some lead bytes, the byte right after the lead byte has a narrower range.
		if (numberOfBytesToProcess == 2) {
			if (firstByte == 0b11100000) {
				// 11100000 101xxxxx 10xxxxxx (U+0800 to U+0FFF): lower values are overlong encodings.
				return (b & 0b11100000) != 0b10100000;
			}
			if (firstByte == 0b11101101) {
				// 11101101 100xxxxx 10xxxxxx (U+D000 to U+D7FF): higher values are surrogates.
				return (b & 0b11100000) != 0b10000000;
			}
		} else if (numberOfBytesToProcess == 3) {
			if (firstByte == 0b11110000) {
				// 11110000 1001xxxx 10xxxxxx 10xxxxxx (U+10000 to U+1FFFF)
				// 11110000 101xxxxx 10xxxxxx 10xxxxxx (U+20000 to U+3FFFF)
				// Lower values are overlong encodings.
				return !((b & 0b11110000) == 0b10010000 || (b & 0b11100000) == 0b10100000);
			}
			if (firstByte == 0b11110100) {
				// 11110100 1000xxxx 10xxxxxx 10xxxxxx (U+100000 to U+10FFFF): higher values exceed U+10FFFF.
				return (b & 0b11110000) != 0b10000000;
			}
		}
		return false;
	}

	/**
	 * Checks whether the given range lies outside of the array.
	 *
	 * @param bytes
	 *            the array.
	 * @param offset
	 *            the start of the range.
	 * @param length
	 *            the length of the range.
	 * @return {@code true} if {@code offset} or {@code length} is negative or if the range ends past the array.
	 */
	private static boolean isOutOfBounds(final byte[] bytes, final int offset, final int length) {
		final int capacity = bytes.length;
		// Compare against (capacity - offset) rather than computing (offset + length), which may overflow.
		return offset < 0 || length < 0 || length > capacity - offset;
	}

}
