wnsrc/lua/includes/modules/utf8.lua

--[[
| This file was obtained through the combined efforts
| of Madbluntz & Plymouth Antiquarian Society.
|
| Credits: lifestorm, Gregory Wayne Rossel JR.,
| 	Maloy, DrPepper10 @ RIP, Atle!
|
| Visit for more: https://plymouth.thetwilightzone.ru/
--]]

local bit		= bit
local error		= error
local ipairs	= ipairs
local string	= string
local table		= table
local unpack	= unpack
local math		= math

module( "utf8" )

--
-- Pattern that can be used with the string library to match a single UTF-8 byte-sequence.
-- This expects the string to contain valid UTF-8 data.
--
-- It matches one lead byte (NUL via %z, 0x01-0x7F or 0xC2-0xF4) followed by zero or
-- more continuation bytes (0x80-0xBF). The number of continuation bytes is not checked
-- against the lead byte, which is why the input has to be valid already.
--
charpattern = "[%z\x01-\x7F\xC2-\xF4][\x80-\xBF]*"

--
-- Resolves string indices into absolute (positive) byte positions.
-- Takes the string followed by any number of indices and returns the resolved
-- indices in the same order.
-- Positive indices are passed through unchanged. Zero and negative indices are
-- counted from the end of the string (0 resolves to one past the end) and are
-- clamped so that they never drop below 1.
-- Raises "bad index to string (out of range)" when a resolved index lies more
-- than one position past the end of the string.
--
local function strRelToAbs( str, ... )

	local indices = { ... }

	for slot, index in ipairs( indices ) do

		local resolved

		if index > 0 then
			resolved = index
		else
			resolved = math.max( #str + index + 1, 1 )
		end

		if resolved < 1 or resolved > #str + 1 then
			-- Level 3 blames the caller of whichever function invoked this helper
			error( "bad index to string (out of range)", 3 )
		end

		indices[ slot ] = resolved

	end

	return unpack( indices )

end

--
-- Decodes a single UTF-8 byte-sequence from a string, ensuring it is valid.
-- startPos is a byte index (negative values count from the end) and defaults to 1.
-- Returns the index of the first/last bytes of the sequence and its codepoint,
-- or nil when there is no byte at startPos or the sequence is malformed.
--
-- A sequence is rejected when its lead byte is not a valid UTF-8 lead byte, when
-- it is truncated, when a continuation byte is malformed, when it is an overlong
-- encoding (a codepoint stored in more bytes than necessary) or when it encodes
-- a value above U+10FFFF. Surrogate codepoints are accepted so that everything
-- char() is able to produce can also be decoded.
--
local function decode( str, startPos )

	startPos = strRelToAbs( str, startPos or 1 )

	local b1 = str:byte( startPos )

	-- End of string
	if not b1 then
		return nil
	end

	-- Single-byte sequence
	if b1 < 0x80 then
		return startPos, startPos, b1
	end

	-- Validate first byte of multi-byte sequence
	-- (0x80-0xBF are continuation bytes, 0xC0/0xC1 can only start overlong forms,
	-- 0xF5 and above can only start codepoints beyond U+10FFFF)
	if b1 > 0xF4 or b1 < 0xC2 then
		return nil
	end

	-- Get 'supposed' amount of continuation bytes from primary byte
	local contByteCount =	( b1 >= 0xF0 and 3 ) or
							( b1 >= 0xE0 and 2 ) or
							1

	local endPos = startPos + contByteCount

	-- The string doesn't have enough data for this many continuation bytes
	if #str < endPos then
		return nil
	end

	-- Payload bits of the primary byte: 5, 4 or 3 bits for 1, 2 or 3 continuation bytes
	local codePoint = bit.band( b1, bit.rshift( 0x3F, contByteCount ) )

	-- Validate our continuation bytes and append their 6 payload bits each
	for i = startPos + 1, endPos do

		local bX = str:byte( i )

		-- Invalid continuation byte hit
		if bit.band( bX, 0xC0 ) ~= 0x80 then
			return nil
		end

		codePoint = bit.bor( bit.lshift( codePoint, 6 ), bit.band( bX, 0x3F ) )

	end

	-- Smallest codepoint that actually needs this many bytes; anything below is overlong
	local minCodePoint =	( contByteCount == 3 and 0x10000 ) or
							( contByteCount == 2 and 0x800 ) or
							0x80

	if codePoint < minCodePoint or codePoint > 0x10FFFF then
		return nil
	end

	return startPos, endPos, codePoint

end

--
-- Encodes zero or more integer codepoints as UTF-8 and returns them joined into one string.
-- Every argument has to lie within 0 .. 0x10FFFF, otherwise
-- "bad argument #k to char (out of range)" is raised (k being the argument's position).
--
function char( ... )

	local pieces = {}

	for argIndex, cp in ipairs { ... } do

		if cp < 0 or cp > 0x10FFFF then
			error( "bad argument #" .. argIndex .. " to char (out of range)", 2 )
		end

		local encoded

		if cp < 0x80 then

			-- Plain ASCII, stored as-is
			encoded = string.char( cp )

		elseif cp < 0x800 then

			-- 110xxxxx 10xxxxxx
			local lead  = bit.bor( 0xC0, bit.band( bit.rshift( cp, 6 ), 0x1F ) )
			local tail1 = bit.bor( 0x80, bit.band( cp, 0x3F ) )

			encoded = string.char( lead, tail1 )

		elseif cp < 0x10000 then

			-- 1110xxxx 10xxxxxx 10xxxxxx
			local lead  = bit.bor( 0xE0, bit.band( bit.rshift( cp, 12 ), 0x0F ) )
			local tail1 = bit.bor( 0x80, bit.band( bit.rshift( cp, 6 ), 0x3F ) )
			local tail2 = bit.bor( 0x80, bit.band( cp, 0x3F ) )

			encoded = string.char( lead, tail1, tail2 )

		else

			-- 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
			local lead  = bit.bor( 0xF0, bit.band( bit.rshift( cp, 18 ), 0x07 ) )
			local tail1 = bit.bor( 0x80, bit.band( bit.rshift( cp, 12 ), 0x3F ) )
			local tail2 = bit.bor( 0x80, bit.band( bit.rshift( cp, 6 ), 0x3F ) )
			local tail3 = bit.bor( 0x80, bit.band( cp, 0x3F ) )

			encoded = string.char( lead, tail1, tail2, tail3 )

		end

		pieces[ #pieces + 1 ] = encoded

	end

	return table.concat( pieces )

end

--
-- Returns an iterator over the UTF-8 sequences of a string, for use in a generic for loop.
-- Each step yields k = byte index at which the sequence starts, v = its codepoint.
-- The iterator raises "invalid UTF-8 code" when it runs into a sequence that can't be decoded.
--
function codes( str )

	local nextPos = 1

	return function()

		if nextPos <= #str then

			local seqStart, seqEnd, codePoint = decode( str, nextPos )

			if not seqStart then
				error( "invalid UTF-8 code", 2 )
			end

			-- Continue right behind the sequence we just read
			nextPos = seqEnd + 1

			return seqStart, codePoint

		end

		-- Walked past the last byte, iteration is over
		return nil

	end

end

--
-- Returns the codepoints (as integers) of all UTF-8 sequences that start between
-- the byte positions startPos and endPos (inclusive).
-- startPos defaults to 1, endPos defaults to startPos. Negative positions count
-- from the end of the string.
-- Returns nothing when the resolved range is empty (startPos lies behind endPos).
-- Raises "invalid UTF-8 code" when a sequence inside the range can't be decoded.
--
function codepoint( str, startPos, endPos )

	startPos, endPos = strRelToAbs( str, startPos or 1, endPos or startPos or 1 )

	local ret = {}

	-- A pre-checked loop so that an empty range yields no values at all
	-- instead of decoding the sequence at startPos anyway
	while startPos <= endPos do
		local seqStartPos, seqEndPos, codePoint = decode( str, startPos )

		if not seqStartPos then
			error( "invalid UTF-8 code", 2 )
		end

		ret[ #ret + 1 ] = codePoint

		-- Increment current string index
		startPos = seqEndPos + 1
	end

	return unpack( ret )

end

--
-- Counts the UTF-8 sequences that start between the byte positions startPos and endPos.
-- startPos defaults to 1, endPos defaults to -1 (the last byte).
-- On success the count is returned. If a sequence can't be decoded, false and
-- the byte index of the offending sequence are returned instead.
--
function len( str, startPos, endPos )

	startPos, endPos = strRelToAbs( str, startPos or 1, endPos or -1 )

	local byteLen = #str
	local cursor  = startPos
	local count   = 0

	while cursor <= endPos and cursor <= byteLen do

		local seqStart, seqEnd = decode( str, cursor )

		-- Hit an invalid sequence?
		if not seqStart then
			return false, cursor
		end

		count  = count + 1
		cursor = seqEnd + 1

	end

	return count

end

--
-- Returns the byte-index of the n'th UTF-8-character after the given byte-index (nil if none)
-- startPos defaults to 1 when n is not negative and to the last character when n is negative
-- If n is zero, this function instead returns the byte-index of the UTF-8-character startPos lies within
-- (0 if no decodable sequence is found while backing up).
--
-- Raises "bad index to string (out of range)" if an explicit startPos is 0 or lies outside the string,
-- and "initial position is a continuation byte" if n is non-zero and no sequence can be decoded at the
-- starting position.
--
function offset( str, n, startPos )

	if startPos and ( startPos > #str or -startPos > #str or startPos == 0 ) then
		error( "bad index to string (out of range)", 2 )
	end

	local pos = ( n >= 0 ) and 1 or #str
	      pos = strRelToAbs( str, startPos or pos )

	-- The default start for a negative n is the last byte of the string, which is a
	-- continuation byte whenever the string ends in a multi-byte sequence.
	-- Back up to the start of that sequence so the default doesn't trip the check below.
	if not startPos and n < 0 then

		while pos > 1 and not decode( str, pos ) do
			pos = pos - 1
		end

	end

	-- Back up to the start of this byte sequence
	if n == 0 then

		while pos > 0 and not decode( str, pos ) do
			pos = pos - 1
		end

		return pos

	end

	--
	-- Make sure we're on a valid sequence
	--
	if not decode( str, pos ) then
		error( "initial position is a continuation byte", 2 )
	end

	-- Back up to (-n) byte sequences
	if n < 0 then

		for i = 1, -n do
			pos = pos - 1

			-- Skip everything that doesn't start a sequence
			while pos > 0 and not decode( str, pos ) do
				pos = pos - 1
			end
		end

		-- Ran off the front of the string
		if pos < 1 then
			return nil
		end

		return pos

	end

	-- Jump forward (n) byte sequences
	if n > 0 then

		for i = 1, n do
			pos = pos + 1

			-- Skip everything that doesn't start a sequence
			while pos <= #str and not decode( str, pos ) do
				pos = pos + 1
			end
		end

		-- Ran off the end of the string
		if pos > #str then
			return nil
		end

		return pos

	end

end

--
-- Returns a copy of the string that only contains decodable UTF-8 sequences.
-- Every byte at which no sequence can be decoded is swapped for U+FFFD.
-- An empty input is handed back untouched.
--
function force( str )

	local byteLen = #str

	-- Nothing to repair
	if byteLen == 0 then
		return str
	end

	local pieces = {}
	local cursor = 1

	while cursor <= byteLen do

		local seqStart, seqEnd = decode( str, cursor )

		if seqStart then

			-- Good sequence, copy it over verbatim
			pieces[ #pieces + 1 ] = str:sub( seqStart, seqEnd )
			cursor = seqEnd + 1

		else

			-- Bad byte, substitute it and retry at the very next byte
			pieces[ #pieces + 1 ] = char( 0xFFFD )
			cursor = cursor + 1

		end

	end

	return table.concat( pieces )

end

--
-- Resolves a relative character index into an absolute one.
-- Unlike strRelToAbs this counts UTF-8 characters (via len) rather than bytes.
-- Non-negative indices are returned unchanged, negative ones count from the
-- end of the string and bottom out at 0.
--
local function strRelToAbsChar( str, pos )
	if not ( pos < 0 ) then
		return pos
	end

	return math.max( pos + len( str ) + 1, 0 )
end

--
-- UTF-8 compliant version of str[idx]
-- Returns the idx'th character of the string (negative idx counts from the end),
-- or an empty string if idx resolves to 0 or lies past the last character.
--
function GetChar( str, idx )
	local charIndex = strRelToAbsChar( str, idx )

	if charIndex == 0 or charIndex > len( str ) then
		return ""
	end

	-- offset counts characters past the first one, hence the - 1
	local bytePos = offset( str, charIndex - 1 )

	return char( codepoint( str, bytePos ) )
end

--
-- UTF-8 compliant version of string.sub
-- charstart and charend are character (not byte) indices, negative values count
-- from the end of the string. charend defaults to -1 (the last character).
-- Returns an empty string when the range selects no characters.
-- Raises "invalid UTF-8 code" if a non-empty range is requested from a string
-- that contains a sequence which can't be decoded.
--
-- The string is walked once and the result is cut out with a single string.sub,
-- so the cost grows with the length of the string and not with the size of the
-- requested range (an oversized charend is harmless).
--
function sub( str, charstart, charend )
	charstart = strRelToAbsChar( str, charstart )
	charend = strRelToAbsChar( str, charend or -1 )

	-- Character indices start at 1
	if charstart < 1 then
		charstart = 1
	end

	if charend < charstart then
		return ""
	end

	local firstByte, lastByte
	local charIndex, bytePos = 0, 1
	local byteLen = #str

	while bytePos <= byteLen do
		local seqStart, seqEnd = decode( str, bytePos )

		if not seqStart then
			error( "invalid UTF-8 code", 2 )
		end

		charIndex = charIndex + 1

		if charIndex == charstart then
			firstByte = seqStart
		end

		-- Keep extending the end of the slice while we're inside the requested range
		if charIndex >= charstart and charIndex <= charend then
			lastByte = seqEnd
		end

		bytePos = seqEnd + 1
	end

	-- charstart lies past the last character
	if not firstByte then
		return ""
	end

	return str:sub( firstByte, lastByte )
end