mirror of
https://github.com/lifestorm/wnsrc.git
synced 2025-12-16 21:33:46 +03:00
388 lines
7.7 KiB
Lua
388 lines
7.7 KiB
Lua
--[[
|
|
| This file was obtained through the combined efforts
|
|
| of Madbluntz & Plymouth Antiquarian Society.
|
|
|
|
|
| Credits: lifestorm, Gregory Wayne Rossel JR.,
|
|
| Maloy, DrPepper10 @ RIP, Atle!
|
|
|
|
|
| Visit for more: https://plymouth.thetwilightzone.ru/
|
|
--]]
|
|
|
|
local bit = bit
|
|
local error = error
|
|
local ipairs = ipairs
|
|
local string = string
|
|
local table = table
|
|
local unpack = unpack
|
|
local math = math
|
|
|
|
module( "utf8" )
|
|
|
|
--
|
|
-- Pattern that can be used with the string library to match a single UTF-8 byte-sequence.
|
|
-- This expects the string to contain valid UTF-8 data.
|
|
--
|
|
charpattern = "[%z\x01-\x7F\xC2-\xF4][\x80-\xBF]*"
|
|
|
|
--
|
|
-- Transforms indexes of a string to be positive.
|
|
-- Negative indices will wrap around like the string library's functions.
|
|
--
|
|
local function strRelToAbs( str, ... )
|
|
|
|
local args = { ... }
|
|
|
|
for k, v in ipairs( args ) do
|
|
v = v > 0 and v or math.max( #str + v + 1, 1 )
|
|
|
|
if v < 1 or v > #str + 1 then
|
|
error( "bad index to string (out of range)", 3 )
|
|
end
|
|
|
|
args[ k ] = v
|
|
end
|
|
|
|
return unpack( args )
|
|
|
|
end
|
|
|
|
-- Decodes a single UTF-8 byte-sequence from a string, ensuring it is valid
|
|
-- Returns the index of the first/last chars of a sequence and its codepoint
|
|
--
|
|
local function decode( str, startPos )
|
|
|
|
startPos = strRelToAbs( str, startPos or 1 )
|
|
|
|
local b1 = str:byte( startPos, startPos )
|
|
|
|
-- End of string
|
|
if not b1 then
|
|
return nil
|
|
end
|
|
|
|
-- Single-byte sequence
|
|
if b1 < 0x80 then
|
|
return startPos, startPos, b1
|
|
end
|
|
|
|
-- Validate first byte of multi-byte sequence
|
|
if b1 > 0xF4 or b1 < 0xC2 then
|
|
return nil
|
|
end
|
|
|
|
-- Get 'supposed' amount of continuation bytes from primary byte
|
|
local contByteCount = b1 >= 0xF0 and 3 or
|
|
b1 >= 0xE0 and 2 or
|
|
b1 >= 0xC0 and 1
|
|
|
|
local endPos = startPos + contByteCount
|
|
local codePoint = 0
|
|
|
|
-- The string doesn't have enough data for this many continutation bytes
|
|
if #str < endPos then
|
|
return nil
|
|
end
|
|
|
|
-- Validate our continuation bytes
|
|
for _, bX in ipairs { str:byte( startPos + 1, endPos ) } do
|
|
|
|
-- Invalid continuation byte hit
|
|
if bit.band( bX, 0xC0 ) ~= 0x80 then
|
|
return nil
|
|
end
|
|
|
|
codePoint = bit.bor( bit.lshift( codePoint, 6 ), bit.band( bX, 0x3F ) )
|
|
b1 = bit.lshift( b1, 1 )
|
|
|
|
end
|
|
|
|
codePoint = bit.bor( codePoint, bit.lshift( bit.band( b1, 0x7F ), contByteCount * 5 ) )
|
|
|
|
return startPos, endPos, codePoint
|
|
|
|
end
|
|
|
|
--
|
|
-- Takes zero or more integers and returns a string containing the UTF-8 representation of each
|
|
--
|
|
function char( ... )
|
|
|
|
local buf = {}
|
|
|
|
for k, v in ipairs { ... } do
|
|
|
|
if v < 0 or v > 0x10FFFF then
|
|
error( "bad argument #" .. k .. " to char (out of range)", 2 )
|
|
end
|
|
|
|
local b1, b2, b3, b4 = nil, nil, nil, nil
|
|
|
|
if v < 0x80 then -- Single-byte sequence
|
|
|
|
table.insert( buf, string.char( v ) )
|
|
|
|
elseif v < 0x800 then -- Two-byte sequence
|
|
|
|
b1 = bit.bor( 0xC0, bit.band( bit.rshift( v, 6 ), 0x1F ) )
|
|
b2 = bit.bor( 0x80, bit.band( v, 0x3F ) )
|
|
|
|
table.insert( buf, string.char( b1, b2 ) )
|
|
|
|
elseif v < 0x10000 then -- Three-byte sequence
|
|
|
|
b1 = bit.bor( 0xE0, bit.band( bit.rshift( v, 12 ), 0x0F ) )
|
|
b2 = bit.bor( 0x80, bit.band( bit.rshift( v, 6 ), 0x3F ) )
|
|
b3 = bit.bor( 0x80, bit.band( v, 0x3F ) )
|
|
|
|
table.insert( buf, string.char( b1, b2, b3 ) )
|
|
|
|
else -- Four-byte sequence
|
|
|
|
b1 = bit.bor( 0xF0, bit.band( bit.rshift( v, 18 ), 0x07 ) )
|
|
b2 = bit.bor( 0x80, bit.band( bit.rshift( v, 12 ), 0x3F ) )
|
|
b3 = bit.bor( 0x80, bit.band( bit.rshift( v, 6 ), 0x3F ) )
|
|
b4 = bit.bor( 0x80, bit.band( v, 0x3F ) )
|
|
|
|
table.insert( buf, string.char( b1, b2, b3, b4 ) )
|
|
|
|
end
|
|
|
|
end
|
|
|
|
return table.concat( buf, "" )
|
|
|
|
end
|
|
|
|
--
|
|
-- Iterates over a UTF-8 string similarly to pairs
|
|
-- k = index of sequence, v = string value of sequence
|
|
--
|
|
function codes( str )
|
|
|
|
local i = 1
|
|
|
|
return function()
|
|
|
|
-- Have we hit the end of the iteration set?
|
|
if i > #str then
|
|
return nil
|
|
end
|
|
|
|
local startPos, endPos, codePoint = decode( str, i )
|
|
|
|
if not startPos then
|
|
error( "invalid UTF-8 code", 2 )
|
|
end
|
|
|
|
i = endPos + 1
|
|
|
|
return startPos, codePoint
|
|
|
|
end
|
|
|
|
end
|
|
|
|
--
|
|
-- Returns an integer-representation of the UTF-8 sequence(s) in a string
|
|
-- startPos defaults to 1, endPos defaults to startPos
|
|
--
|
|
function codepoint( str, startPos, endPos )
|
|
|
|
startPos, endPos = strRelToAbs( str, startPos or 1, endPos or startPos or 1 )
|
|
|
|
local ret = {}
|
|
|
|
repeat
|
|
local seqStartPos, seqEndPos, codePoint = decode( str, startPos )
|
|
|
|
if not seqStartPos then
|
|
error( "invalid UTF-8 code", 2 )
|
|
end
|
|
|
|
-- Increment current string index
|
|
startPos = seqEndPos + 1
|
|
|
|
table.insert( ret, codePoint )
|
|
until seqEndPos >= endPos
|
|
|
|
return unpack( ret )
|
|
|
|
end
|
|
|
|
--
|
|
-- Returns the length of a UTF-8 string. false, index is returned if an invalid sequence is hit
|
|
-- startPos defaults to 1, endPos defaults to -1
|
|
--
|
|
function len( str, startPos, endPos )
|
|
|
|
startPos, endPos = strRelToAbs( str, startPos or 1, endPos or -1 )
|
|
|
|
local len = 0
|
|
|
|
while endPos >= startPos and startPos <= #str do
|
|
local seqStartPos, seqEndPos = decode( str, startPos )
|
|
|
|
-- Hit an invalid sequence?
|
|
if not seqStartPos then
|
|
return false, startPos
|
|
end
|
|
|
|
-- Increment current string pointer
|
|
startPos = seqEndPos + 1
|
|
|
|
-- Increment length
|
|
len = len + 1
|
|
end
|
|
|
|
return len
|
|
|
|
end
|
|
|
|
--
|
|
-- Returns the byte-index of the n'th UTF-8-character after the given byte-index (nil if none)
|
|
-- startPos defaults to 1 when n is positive and -1 when n is negative
|
|
-- If 0 is zero, this function instead returns the byte-index of the UTF-8-character startPos lies within.
|
|
--
|
|
function offset( str, n, startPos )
|
|
|
|
if startPos and ( startPos > #str or -startPos > #str or startPos == 0 ) then
|
|
error( "bad index to string (out of range)", 2 )
|
|
end
|
|
|
|
local pos = ( n >= 0 ) and 1 or #str
|
|
pos = strRelToAbs( str, startPos or pos )
|
|
|
|
-- Back up to the start of this byte sequence
|
|
if n == 0 then
|
|
|
|
while pos > 0 and not decode( str, pos ) do
|
|
pos = pos - 1
|
|
end
|
|
|
|
return pos
|
|
|
|
end
|
|
|
|
--
|
|
-- Make sure we're on a valid sequence
|
|
--
|
|
if not decode( str, pos ) then
|
|
error( "initial position is a continuation byte", 2 )
|
|
end
|
|
|
|
-- Back up to (-n) byte sequences
|
|
if n < 0 then
|
|
|
|
for i = 1, -n do
|
|
pos = pos - 1
|
|
|
|
while pos > 0 and not decode( str, pos ) do
|
|
pos = pos - 1
|
|
end
|
|
end
|
|
|
|
if pos < 1 then
|
|
return nil
|
|
end
|
|
|
|
return pos
|
|
|
|
end
|
|
|
|
-- Jump forward (n) byte sequences
|
|
if n > 0 then
|
|
|
|
for i = 1, n do
|
|
pos = pos + 1
|
|
|
|
while pos <= #str and not decode( str, pos ) do
|
|
pos = pos + 1
|
|
end
|
|
end
|
|
|
|
if pos > #str then
|
|
return nil
|
|
end
|
|
|
|
return pos
|
|
|
|
end
|
|
|
|
end
|
|
|
|
--
|
|
-- Forces a string to contain only valid UTF-8 data.
|
|
-- Invalid sequences are replaced with U+FFFD.
|
|
--
|
|
function force( str )
|
|
|
|
local buf = {}
|
|
|
|
local curPos, endPos = 1, #str
|
|
|
|
-- Empty string?
|
|
if endPos == 0 then
|
|
return str
|
|
end
|
|
|
|
repeat
|
|
|
|
local seqStartPos, seqEndPos = decode( str, curPos )
|
|
|
|
if not seqStartPos then
|
|
|
|
table.insert( buf, char( 0xFFFD ) )
|
|
curPos = curPos + 1
|
|
|
|
else
|
|
|
|
table.insert( buf, str:sub( seqStartPos, seqEndPos ) )
|
|
curPos = seqEndPos + 1
|
|
|
|
end
|
|
|
|
until curPos > endPos
|
|
|
|
return table.concat( buf, "" )
|
|
|
|
end
|
|
|
|
--
|
|
-- Converts a relative index to an absolute
|
|
-- This is different from the above in that it cares about characters and not bytes
|
|
--
|
|
local function strRelToAbsChar( str, pos )
|
|
if pos < 0 then
|
|
pos = math.max( pos + len( str ) + 1, 0 )
|
|
end
|
|
return pos
|
|
end
|
|
|
|
--
|
|
-- UTF-8 compilant version of str[idx]
|
|
--
|
|
function GetChar( str, idx )
|
|
idx = strRelToAbsChar( str, idx )
|
|
|
|
if idx == 0 then return "" end
|
|
if idx > len( str ) then return "" end
|
|
|
|
local off = offset( str, idx - 1 )
|
|
return char( codepoint( str, off ) )
|
|
end
|
|
|
|
--
|
|
-- UTF-8 compilant version of string.sub
|
|
--
|
|
function sub( str, charstart, charend )
|
|
charstart = strRelToAbsChar( str, charstart )
|
|
charend = strRelToAbsChar( str, charend or -1 )
|
|
|
|
local buf = {}
|
|
for i = charstart, charend do
|
|
buf[#buf + 1] = GetChar( str, i )
|
|
end
|
|
|
|
return table.concat( buf )
|
|
end
|