Files
wnsrc/lua/includes/modules/utf8.lua
lifestorm 94063e4369 Upload
2024-08-04 22:55:00 +03:00

388 lines
7.7 KiB
Lua

--[[
| This file was obtained through the combined efforts
| of Madbluntz & Plymouth Antiquarian Society.
|
| Credits: lifestorm, Gregory Wayne Rossel JR.,
| Maloy, DrPepper10 @ RIP, Atle!
|
| Visit for more: https://plymouth.thetwilightzone.ru/
--]]
local bit = bit
local error = error
local ipairs = ipairs
local string = string
local table = table
local unpack = unpack
local math = math
module( "utf8" )
--
-- Pattern that can be used with the string library to match a single UTF-8 byte-sequence:
-- one lead byte (NUL via %z, 0x01-0x7F, or 0xC2-0xF4) followed by zero or more
-- continuation bytes (0x80-0xBF).
-- The pattern performs no validation of its own (the number of continuation bytes is
-- not tied to the lead byte), so it only yields whole characters when the string
-- already contains valid UTF-8 data.
--
charpattern = "[%z\x01-\x7F\xC2-\xF4][\x80-\xBF]*"
--
-- Converts one or more byte indices of a string into absolute, positive indices.
-- Zero and negative indices count back from one past the end of the string
-- (so -1 is the last byte and 0 is #str + 1) and are clamped to 1 at the low end.
-- Raises "bad index to string (out of range)" when a resulting index is below 1
-- or beyond #str + 1. The error uses level 3, i.e. it is attributed to the caller
-- of the function that called this helper.
-- Returns the converted indices in the order they were given.
--
local function strRelToAbs( str, ... )
	local positions = { ... }

	for slot, index in ipairs( positions ) do
		-- Anything that is not strictly positive is relative to the end
		if not ( index > 0 ) then
			index = math.max( #str + index + 1, 1 )
		end

		if index < 1 or index > #str + 1 then
			error( "bad index to string (out of range)", 3 )
		end

		positions[ slot ] = index
	end

	return unpack( positions )
end
--
-- Decodes a single UTF-8 byte-sequence from a string, ensuring it is valid.
-- startPos is a byte index (relative indices allowed) and defaults to 1.
-- Returns the index of the first byte, the index of the last byte and the
-- code point of the sequence.
-- Returns nil when startPos is past the end of the string or when the bytes
-- at startPos are not a well-formed sequence: a stray continuation byte, an
-- impossible lead byte, a truncated sequence, an overlong encoding, or a value
-- above U+10FFFF.
-- Surrogate code points (U+D800-U+DFFF) are not rejected, so every sequence
-- produced by char() decodes again.
--
local function decode( str, startPos )
	startPos = strRelToAbs( str, startPos or 1 )

	local b1 = str:byte( startPos, startPos )

	-- End of string
	if not b1 then
		return nil
	end

	-- Single-byte sequence
	if b1 < 0x80 then
		return startPos, startPos, b1
	end

	-- 0x80-0xBF are continuation bytes, 0xC0/0xC1 can only start overlong
	-- two-byte forms and 0xF5+ can only start values above U+10FFFF
	if b1 > 0xF4 or b1 < 0xC2 then
		return nil
	end

	-- From the lead byte: how many continuation bytes follow, which of its bits
	-- are payload, and the smallest code point that really needs this length
	local contByteCount, leadMask, minCodePoint
	if b1 >= 0xF0 then
		contByteCount, leadMask, minCodePoint = 3, 0x07, 0x10000
	elseif b1 >= 0xE0 then
		contByteCount, leadMask, minCodePoint = 2, 0x0F, 0x800
	else
		contByteCount, leadMask, minCodePoint = 1, 0x1F, 0x80
	end

	local endPos = startPos + contByteCount

	-- The string doesn't have enough data for this many continuation bytes
	if #str < endPos then
		return nil
	end

	-- Fold in six payload bits per continuation byte, validating each one
	local codePoint = bit.band( b1, leadMask )
	for i = startPos + 1, endPos do
		local bX = str:byte( i )

		-- Continuation bytes must look like 10xxxxxx
		if bit.band( bX, 0xC0 ) ~= 0x80 then
			return nil
		end

		codePoint = bit.bor( bit.lshift( codePoint, 6 ), bit.band( bX, 0x3F ) )
	end

	-- Reject overlong encodings (e.g. E0 80 80) and values beyond the Unicode
	-- range (e.g. F4 90 80 80); the lead-byte check alone lets both through
	if codePoint < minCodePoint or codePoint > 0x10FFFF then
		return nil
	end

	return startPos, endPos, codePoint
end
--
-- Encodes zero or more code points as UTF-8 and returns them joined into one string.
-- Each argument must lie in the range 0 .. 0x10FFFF; otherwise
-- "bad argument #k to char (out of range)" is raised, attributed to the caller.
-- With no arguments the result is the empty string.
--
function char( ... )
	local pieces = {}

	for argIndex, codePoint in ipairs { ... } do
		if codePoint < 0 or codePoint > 0x10FFFF then
			error( "bad argument #" .. argIndex .. " to char (out of range)", 2 )
		end

		local encoded

		if codePoint < 0x80 then
			-- 0xxxxxxx
			encoded = string.char( codePoint )
		elseif codePoint < 0x800 then
			-- 110xxxxx 10xxxxxx
			encoded = string.char(
				bit.bor( 0xC0, bit.band( bit.rshift( codePoint, 6 ), 0x1F ) ),
				bit.bor( 0x80, bit.band( codePoint, 0x3F ) )
			)
		elseif codePoint < 0x10000 then
			-- 1110xxxx 10xxxxxx 10xxxxxx
			encoded = string.char(
				bit.bor( 0xE0, bit.band( bit.rshift( codePoint, 12 ), 0x0F ) ),
				bit.bor( 0x80, bit.band( bit.rshift( codePoint, 6 ), 0x3F ) ),
				bit.bor( 0x80, bit.band( codePoint, 0x3F ) )
			)
		else
			-- 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
			encoded = string.char(
				bit.bor( 0xF0, bit.band( bit.rshift( codePoint, 18 ), 0x07 ) ),
				bit.bor( 0x80, bit.band( bit.rshift( codePoint, 12 ), 0x3F ) ),
				bit.bor( 0x80, bit.band( bit.rshift( codePoint, 6 ), 0x3F ) ),
				bit.bor( 0x80, bit.band( codePoint, 0x3F ) )
			)
		end

		pieces[ #pieces + 1 ] = encoded
	end

	return table.concat( pieces )
end
--
-- Returns an iterator over the UTF-8 sequences of a string, for use in a generic for.
-- Each step yields k = byte index where the sequence starts, v = its code point.
-- The iterator raises "invalid UTF-8 code" (attributed to its caller) as soon as
-- it reaches bytes that decode() does not accept.
--
function codes( str )
	-- Byte index of the next sequence to decode
	local cursor = 1

	return function()
		if cursor <= #str then
			local seqStart, seqEnd, codePoint = decode( str, cursor )

			if seqStart == nil then
				error( "invalid UTF-8 code", 2 )
			end

			cursor = seqEnd + 1

			return seqStart, codePoint
		end

		-- Ran past the last byte: end of iteration
		return nil
	end
end
--
-- Returns the code points (as integers) of every UTF-8 sequence that starts
-- between the byte indices startPos and endPos (both included).
-- startPos defaults to 1, endPos defaults to startPos; relative indices are allowed.
-- An empty range (endPos before startPos) returns nothing.
-- Raises "invalid UTF-8 code" (attributed to the caller) if decode() rejects
-- the bytes at any position visited.
--
function codepoint( str, startPos, endPos )
	startPos, endPos = strRelToAbs( str, startPos or 1, endPos or startPos or 1 )

	local ret = {}

	-- Test the range before decoding so that an empty range yields no values
	-- instead of always decoding one sequence at startPos
	while startPos <= endPos do
		local seqStartPos, seqEndPos, codePoint = decode( str, startPos )

		if not seqStartPos then
			error( "invalid UTF-8 code", 2 )
		end

		ret[ #ret + 1 ] = codePoint

		-- Continue right after the sequence that was just decoded
		startPos = seqEndPos + 1
	end

	return unpack( ret )
end
--
-- Counts the UTF-8 sequences that start between the byte indices startPos and endPos.
-- startPos defaults to 1, endPos defaults to -1 (the last byte).
-- Returns the count, or false plus the byte index of the first position that
-- decode() rejects.
--
function len( str, startPos, endPos )
	startPos, endPos = strRelToAbs( str, startPos or 1, endPos or -1 )

	local count = 0
	local cursor = startPos
	local byteLen = #str

	while cursor <= endPos and cursor <= byteLen do
		local seqStart, seqEnd = decode( str, cursor )

		-- Hit an invalid sequence: report where
		if not seqStart then
			return false, cursor
		end

		cursor = seqEnd + 1
		count = count + 1
	end

	return count
end
--
-- Returns the byte-index of the n'th UTF-8-character after the given byte-index (nil if none).
-- A negative n counts characters backwards instead.
-- startPos defaults to 1 when n is positive and to the last character when n is negative.
-- If n is zero, this function instead returns the byte-index of the UTF-8-character
-- startPos lies within (0 if no sequence start is found at or before startPos).
-- Raises "bad index to string (out of range)" for an explicit startPos of 0 or one
-- whose magnitude exceeds #str, and "initial position is a continuation byte" when
-- n is non-zero and decode() does not accept a sequence at the start position.
--
function offset( str, n, startPos )
	if startPos and ( startPos > #str or -startPos > #str or startPos == 0 ) then
		error( "bad index to string (out of range)", 2 )
	end

	local pos = ( n >= 0 ) and 1 or #str
	pos = strRelToAbs( str, startPos or pos )

	-- The default start for a negative n is the last byte, which is a
	-- continuation byte whenever the string ends in a multi-byte character.
	-- Snap it back to the start of the sequence that ends on that byte (a
	-- sequence is at most four bytes long) so the default never trips the
	-- continuation-byte check below on well-formed input.
	if not startPos and n < 0 then
		local lastByte = pos
		for lead = lastByte, math.max( lastByte - 3, 1 ), -1 do
			local _, seqEnd = decode( str, lead )
			if seqEnd == lastByte then
				pos = lead
				break
			end
		end
	end

	-- Back up to the start of this byte sequence
	if n == 0 then
		while pos > 0 and not decode( str, pos ) do
			pos = pos - 1
		end

		return pos
	end

	-- Make sure we're on a valid sequence
	if not decode( str, pos ) then
		error( "initial position is a continuation byte", 2 )
	end

	-- Back up (-n) byte sequences
	if n < 0 then
		for _ = 1, -n do
			pos = pos - 1

			while pos > 0 and not decode( str, pos ) do
				pos = pos - 1
			end
		end

		-- Walked off the front of the string
		if pos < 1 then
			return nil
		end

		return pos
	end

	-- Jump forward (n) byte sequences
	if n > 0 then
		for _ = 1, n do
			pos = pos + 1

			while pos <= #str and not decode( str, pos ) do
				pos = pos + 1
			end
		end

		-- Walked off the end of the string
		if pos > #str then
			return nil
		end

		return pos
	end
end
--
-- Returns a copy of the string that contains only sequences accepted by decode().
-- Every byte at which decoding fails is replaced with U+FFFD and scanning
-- resumes at the following byte. An empty string is returned as-is.
--
function force( str )
	local byteLen = #str

	-- Nothing to scan
	if byteLen == 0 then
		return str
	end

	local pieces = {}
	local cursor = 1

	while cursor <= byteLen do
		local seqStart, seqEnd = decode( str, cursor )

		if seqStart then
			-- Valid sequence: copy its bytes through untouched
			pieces[ #pieces + 1 ] = str:sub( seqStart, seqEnd )
			cursor = seqEnd + 1
		else
			-- Invalid byte: substitute the replacement character, skip one byte
			pieces[ #pieces + 1 ] = char( 0xFFFD )
			cursor = cursor + 1
		end
	end

	return table.concat( pieces )
end
--
-- Converts a relative character index to an absolute one.
-- Unlike strRelToAbs this counts UTF-8 characters (via len), not bytes.
-- Negative indices count back from the last character and are clamped to 0;
-- zero and positive indices are returned unchanged.
--
local function strRelToAbsChar( str, pos )
	if not ( pos < 0 ) then
		return pos
	end

	return math.max( pos + len( str ) + 1, 0 )
end
--
-- UTF-8 compliant version of str[idx]
-- idx is a character index; negative values count back from the last character.
-- Returns the character as a string, or "" when idx resolves to 0 or lies
-- beyond the last character.
--
function GetChar( str, idx )
	local charIndex = strRelToAbsChar( str, idx )

	if charIndex == 0 or charIndex > len( str ) then
		return ""
	end

	-- offset() counts characters after byte 1, hence the zero-based argument
	local byteIndex = offset( str, charIndex - 1 )

	return char( codepoint( str, byteIndex ) )
end
--
-- UTF-8 compliant version of string.sub
-- charstart and charend are character indices (negative values count back from
-- the last character); charend defaults to -1.
-- The result is built one character at a time with GetChar, so indices outside
-- the string simply contribute nothing.
--
function sub( str, charstart, charend )
	local first = strRelToAbsChar( str, charstart )
	local last = strRelToAbsChar( str, charend or -1 )

	local pieces, count = {}, 0

	for charIndex = first, last do
		count = count + 1
		pieces[ count ] = GetChar( str, charIndex )
	end

	return table.concat( pieces )
end