mirror of
https://github.com/lifestorm/wnsrc.git
synced 2025-12-17 13:53:45 +03:00
Upload
This commit is contained in:
387
lua/includes/modules/utf8.lua
Normal file
387
lua/includes/modules/utf8.lua
Normal file
@@ -0,0 +1,387 @@
|
||||
--[[
|
||||
| This file was obtained through the combined efforts
|
||||
| of Madbluntz & Plymouth Antiquarian Society.
|
||||
|
|
||||
| Credits: lifestorm, Gregory Wayne Rossel JR.,
|
||||
| Maloy, DrPepper10 @ RIP, Atle!
|
||||
|
|
||||
| Visit for more: https://plymouth.thetwilightzone.ru/
|
||||
--]]
|
||||
|
||||
local bit = bit
|
||||
local error = error
|
||||
local ipairs = ipairs
|
||||
local string = string
|
||||
local table = table
|
||||
local unpack = unpack
|
||||
local math = math
|
||||
|
||||
module( "utf8" )
|
||||
|
||||
--
-- Lua pattern that matches one UTF-8 byte-sequence: a single lead byte
-- (NUL, 0x01-0x7F or 0xC2-0xF4) followed by any number of continuation bytes (0x80-0xBF).
-- The pattern performs no validation; it expects the subject string to hold valid UTF-8 data.
--
charpattern = "[%z\x01-\x7F\xC2-\xF4][\x80-\xBF]*"
|
||||
|
||||
--
-- Converts one or more byte indices of a string into absolute (positive) indices.
-- A positive index is kept as-is. Any other index is resolved as #str + index + 1
-- (so -1 is the last byte, like the string library) and clamped so it never drops below 1.
-- Raises "bad index to string (out of range)" when a resolved index lies outside
-- 1 .. #str + 1; the error is raised with level 3.
-- Returns the resolved indices in the order they were passed in.
--
local function strRelToAbs( str, ... )

	local indices = { ... }

	for slot, index in ipairs( indices ) do
		local absolute = index

		-- Zero and negative indices are relative to the end of the string
		if not ( index > 0 ) then
			absolute = math.max( #str + index + 1, 1 )
		end

		if absolute < 1 or absolute > #str + 1 then
			error( "bad index to string (out of range)", 3 )
		end

		indices[ slot ] = absolute
	end

	return unpack( indices )

end
|
||||
|
||||
--
-- Decodes a single UTF-8 byte-sequence from a string, ensuring it is valid.
-- startPos is a byte index (defaults to 1; it is resolved through strRelToAbs, so negative
-- values count from the end of the string).
-- Returns the byte index of the first and last byte of the sequence plus its code point,
-- or nil when there is no well-formed sequence at startPos: end of string, stray
-- continuation byte, illegal lead byte, truncated sequence, bad continuation byte,
-- overlong encoding or a value above U+10FFFF.
-- NOTE(review): surrogate code points (U+D800-U+DFFF) are still accepted, which keeps this
-- symmetric with what char() is willing to encode - confirm whether they should be rejected.
--
local function decode( str, startPos )

	startPos = strRelToAbs( str, startPos or 1 )

	local b1 = str:byte( startPos, startPos )

	-- End of string
	if not b1 then
		return nil
	end

	-- Single-byte sequence
	if b1 < 0x80 then
		return startPos, startPos, b1
	end

	-- Validate first byte of multi-byte sequence:
	-- 0x80-0xBF are continuation bytes, 0xC0/0xC1 could only start overlong two-byte forms
	-- and a lead byte above 0xF4 would always encode a value beyond U+10FFFF
	if b1 > 0xF4 or b1 < 0xC2 then
		return nil
	end

	-- Get 'supposed' amount of continuation bytes from primary byte
	local contByteCount = b1 >= 0xF0 and 3 or
	                      b1 >= 0xE0 and 2 or
	                      1

	local endPos = startPos + contByteCount

	-- The string doesn't have enough data for this many continuation bytes
	if #str < endPos then
		return nil
	end

	-- Payload bits of the lead byte: 0x1F, 0x0F or 0x07 for 1, 2 or 3 continuation bytes
	local codePoint = bit.band( b1, bit.rshift( 0x3F, contByteCount ) )

	-- Validate our continuation bytes, appending the 6 payload bits of each
	for _, bX in ipairs { str:byte( startPos + 1, endPos ) } do

		-- Invalid continuation byte hit
		if bit.band( bX, 0xC0 ) ~= 0x80 then
			return nil
		end

		codePoint = bit.bor( bit.lshift( codePoint, 6 ), bit.band( bX, 0x3F ) )

	end

	-- Smallest code point that genuinely needs this many bytes; anything below is overlong
	local minCodePoint = contByteCount == 3 and 0x10000 or
	                     contByteCount == 2 and 0x800 or
	                     0x80

	if codePoint < minCodePoint or codePoint > 0x10FFFF then
		return nil
	end

	return startPos, endPos, codePoint

end
|
||||
|
||||
--
-- Takes zero or more integer code points and returns one string holding the UTF-8
-- encoding of each of them, concatenated in argument order.
-- Raises "bad argument #k to char (out of range)" (level 2) when the k'th value is
-- negative or greater than 0x10FFFF.
--
function char( ... )

	local pieces = {}

	for argIndex, codePoint in ipairs { ... } do

		if codePoint < 0 or codePoint > 0x10FFFF then
			error( "bad argument #" .. argIndex .. " to char (out of range)", 2 )
		end

		local encoded

		if codePoint < 0x80 then

			-- Single-byte sequence: the code point is the byte
			encoded = string.char( codePoint )

		elseif codePoint < 0x800 then

			-- Two-byte sequence: 110xxxxx 10xxxxxx
			encoded = string.char(
				bit.bor( 0xC0, bit.band( bit.rshift( codePoint, 6 ), 0x1F ) ),
				bit.bor( 0x80, bit.band( codePoint, 0x3F ) )
			)

		elseif codePoint < 0x10000 then

			-- Three-byte sequence: 1110xxxx 10xxxxxx 10xxxxxx
			encoded = string.char(
				bit.bor( 0xE0, bit.band( bit.rshift( codePoint, 12 ), 0x0F ) ),
				bit.bor( 0x80, bit.band( bit.rshift( codePoint, 6 ), 0x3F ) ),
				bit.bor( 0x80, bit.band( codePoint, 0x3F ) )
			)

		else

			-- Four-byte sequence: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
			encoded = string.char(
				bit.bor( 0xF0, bit.band( bit.rshift( codePoint, 18 ), 0x07 ) ),
				bit.bor( 0x80, bit.band( bit.rshift( codePoint, 12 ), 0x3F ) ),
				bit.bor( 0x80, bit.band( bit.rshift( codePoint, 6 ), 0x3F ) ),
				bit.bor( 0x80, bit.band( codePoint, 0x3F ) )
			)

		end

		pieces[ #pieces + 1 ] = encoded

	end

	return table.concat( pieces )

end
|
||||
|
||||
--
-- Returns an iterator function that walks a UTF-8 string one sequence at a time.
-- Each call yields the byte index at which the next sequence starts and its code point,
-- and nil once the end of the string has been passed.
-- The iterator raises "invalid UTF-8 code" (level 2) when it reaches a position that
-- does not decode.
--
function codes( str )

	local nextByte = 1

	local function step()

		-- Nothing left to iterate over
		if nextByte > #str then
			return nil
		end

		local seqStart, seqEnd, codePoint = decode( str, nextByte )

		if not seqStart then
			error( "invalid UTF-8 code", 2 )
		end

		-- Resume right behind the sequence that was just decoded
		nextByte = seqEnd + 1

		return seqStart, codePoint

	end

	return step

end
|
||||
|
||||
--
-- Returns the code points (as integers) of the UTF-8 sequence(s) that start between the
-- byte indices startPos and endPos (both inclusive) of a string.
-- startPos defaults to 1, endPos defaults to startPos; both are resolved through strRelToAbs.
-- Returns nothing when the range is empty (startPos > endPos after resolving), which matches
-- how len() treats an empty range.
-- Raises "invalid UTF-8 code" (level 2) when a position inside the range does not decode.
--
function codepoint( str, startPos, endPos )

	startPos, endPos = strRelToAbs( str, startPos or 1, endPos or startPos or 1 )

	local ret = {}

	-- A while loop (rather than repeat/until) so an empty range decodes nothing
	while startPos <= endPos do

		local seqStartPos, seqEndPos, codePoint = decode( str, startPos )

		if not seqStartPos then
			error( "invalid UTF-8 code", 2 )
		end

		ret[ #ret + 1 ] = codePoint

		-- Increment current string index
		startPos = seqEndPos + 1

	end

	return unpack( ret )

end
|
||||
|
||||
--
-- Counts the UTF-8 sequences of a string that start between the byte indices
-- startPos and endPos (both resolved through strRelToAbs).
-- startPos defaults to 1, endPos defaults to -1.
-- Returns the count, or false plus the byte index of the first position that
-- failed to decode.
--
function len( str, startPos, endPos )

	startPos, endPos = strRelToAbs( str, startPos or 1, endPos or -1 )

	local count = 0
	local cursor = startPos

	while cursor <= endPos and cursor <= #str do

		local seqStart, seqEnd = decode( str, cursor )

		-- Invalid sequence: report where it was found
		if not seqStart then
			return false, cursor
		end

		-- Continue right behind this sequence
		cursor = seqEnd + 1
		count = count + 1

	end

	return count

end
|
||||
|
||||
--
-- Returns the byte-index of the n'th UTF-8-character after the given byte-index (nil if none)
-- startPos defaults to 1 when n is zero or positive and to #str when n is negative.
-- If n is zero, the function instead walks backwards from startPos and returns the first
-- byte-index at which a sequence decodes (0 if it runs off the front of the string).
-- Raises "bad index to string (out of range)" (level 2) when startPos is 0 or its magnitude
-- exceeds #str, and "initial position is a continuation byte" (level 2) when n is non-zero
-- and nothing decodes at the starting position.
--
function offset( str, n, startPos )

	if startPos then
		local outOfRange = startPos > #str or -startPos > #str or startPos == 0

		if outOfRange then
			error( "bad index to string (out of range)", 2 )
		end
	end

	local defaultStart = #str

	if n >= 0 then
		defaultStart = 1
	end

	local cursor = strRelToAbs( str, startPos or defaultStart )

	-- n == 0: back up to the start of the byte sequence
	if n == 0 then

		while cursor > 0 and not decode( str, cursor ) do
			cursor = cursor - 1
		end

		return cursor

	end

	-- Any other n requires the starting position to decode
	if not decode( str, cursor ) then
		error( "initial position is a continuation byte", 2 )
	end

	-- n < 0: step back over (-n) byte sequences
	if n < 0 then

		for _ = 1, -n do
			repeat
				cursor = cursor - 1
			until not ( cursor > 0 ) or decode( str, cursor )
		end

		-- Walked off the front of the string
		if cursor < 1 then
			return nil
		end

		return cursor

	end

	-- n > 0: step forward over n byte sequences
	if n > 0 then

		for _ = 1, n do
			repeat
				cursor = cursor + 1
			until not ( cursor <= #str ) or decode( str, cursor )
		end

		-- Walked off the end of the string
		if cursor > #str then
			return nil
		end

		return cursor

	end

end
|
||||
|
||||
--
-- Returns a copy of a string in which every byte that does not start a decodable
-- UTF-8 sequence has been replaced with U+FFFD; decodable sequences are copied unchanged.
-- An empty string is returned as-is.
--
function force( str )

	local byteCount = #str

	-- Empty string?
	if byteCount == 0 then
		return str
	end

	local pieces = {}
	local cursor = 1

	while cursor <= byteCount do

		local seqStart, seqEnd = decode( str, cursor )

		if seqStart then

			-- Keep the sequence and continue right behind it
			pieces[ #pieces + 1 ] = str:sub( seqStart, seqEnd )
			cursor = seqEnd + 1

		else

			-- Substitute the offending byte and retry from the next one
			pieces[ #pieces + 1 ] = char( 0xFFFD )
			cursor = cursor + 1

		end

	end

	return table.concat( pieces )

end
|
||||
|
||||
--
-- Converts a relative character index into an absolute one.
-- Unlike strRelToAbs this counts UTF-8 characters (via len) rather than bytes.
-- Indices that are not negative are returned unchanged; negative ones are counted back
-- from the last character and clamped so the result is never below 0.
--
local function strRelToAbsChar( str, pos )

	if not ( pos < 0 ) then
		return pos
	end

	return math.max( pos + len( str ) + 1, 0 )

end
|
||||
|
||||
--
-- UTF-8 compliant version of str[idx]
-- idx is a character index; negative values count back from the last character.
-- Returns the character as a string, or "" when idx resolves to 0 or lies past the
-- last character.
--
function GetChar( str, idx )

	local charIndex = strRelToAbsChar( str, idx )

	-- Nothing lives at index 0 or behind the last character
	if charIndex == 0 or charIndex > len( str ) then
		return ""
	end

	-- offset counts characters after the first one, hence the - 1
	local byteIndex = offset( str, charIndex - 1 )

	return char( codepoint( str, byteIndex ) )

end
|
||||
|
||||
--
-- UTF-8 compliant version of string.sub
-- charstart and charend are character indices (negative values count back from the last
-- character); charend defaults to -1.
-- Returns the characters charstart .. charend as one string, or "" when the range is empty.
-- The range is clamped to the characters that exist, so an oversized charend costs nothing,
-- and the result is cut out of str with a single byte slice instead of per-character lookups.
-- Raises "invalid UTF-8 code" (level 2) when a non-empty range is requested from a string
-- that len() reports as invalid.
--
function sub( str, charstart, charend )

	charstart = strRelToAbsChar( str, charstart )
	charend = strRelToAbsChar( str, charend or -1 )

	-- There is no character 0; the first character is the earliest possible start
	charstart = math.max( charstart, 1 )

	if charstart > charend then
		return ""
	end

	local charCount = len( str )

	if not charCount then
		error( "invalid UTF-8 code", 2 )
	end

	-- Never walk past the last character
	charend = math.min( charend, charCount )

	if charstart > charend then
		return ""
	end

	-- offset( str, k ) is the first byte of character k + 1
	local byteStart = offset( str, charstart - 1 )
	local byteEnd = #str

	if charend < charCount then
		-- Last byte of charend = byte just before the following character
		byteEnd = offset( str, charend ) - 1
	end

	return str:sub( byteStart, byteEnd )

end
|
||||
Reference in New Issue
Block a user