struct Error <: Exception str::String T code end Error(buf::AbstractString, T, code, pos, tlen) = Error(buf, T, code) Error(buf::AbstractVector{UInt8}, T, code, pos, tlen) = Error(String(buf[pos:(pos + tlen - 1)]), T, code) function Error(buf::IO, T, code, pos, tlen) fastseek!(buf, pos - 1) bytes = read(buf, tlen) return Error(String(bytes), T, code) end function Base.showerror(io::IO, e::Error) c = e.code println(io, "Parsers.Error ($(codes(c))):") println(io, text(c)) println(io, "attempted to parse $(e.T) from: \"$(escape_string(e.str))\"") end # backwards compat neededdigits(::Type{Float64}) = 309 + 17 neededdigits(::Type{Float32}) = 39 + 9 + 2 neededdigits(::Type{Float16}) = 9 + 5 + 9 """ A bitmask value, with various bits corresponding to different parsing signals and scenarios. `Parsers.xparse` returns a `code` value with various bits set according to the various scenarios encountered while parsing a value. * `INVALID`: there are a number of invalid parsing states, all include the INVALID bit set (check via `Parsers.invalid(code)`) * `OK`: signals specifically that a valid value of type `T` was parsed (check via `Parsers.ok(code)`) * `SENTINEL`: signals that a valid sentinel value was detected while parsing, passed via the `sentinel` keyword argument to `Parsers.Options` (check via `Parsers.sentinel(code)`) * `QUOTED`: a `openquotechar` from `Parsers.Options` was detected at the beginning of parsing (check via `Parsers.quoted(code)`) * `DELIMITED`: a `delim` character or string from `Parsers.Options` was detected while parsing (check via `Parsers.delimited(code)`) * `NEWLINE`: a non-quoted newline character (`'\\n'`), return character (`'\\r'`), or CRLF (`"\\r\\n"`) was detected while parsing (check via `Parsers.newline(code)`) * `EOF`: the end of file was reached while parsing * `ESCAPED_STRING`: an `escapechar` from `Parsers.Options` was encountered while parsing (check via `Parsers.escapedstring(code)`) * `INVALID_QUOTED_FIELD`: a `openquotechar` were detected when parsing began, but no corresponding `closequotechar` were found to correctly close a quoted field, this is usually a fatal parsing error because parsing will continue until EOF to look for the close quote character (check via `Parsers.invalidquotedfield(code)`) * `INVALID_DELIMITER`: a `delim` character or string were eventually detected, but not at the expected position (directly after parsing a valid value), indicating there are extra, invalid characters between a valid value and the expected delimiter (check via `Parsers.invaliddelimiter(code)`) * `OVERFLOW`: overflow occurred while parsing a type, like `Integer`, that have limits on valid values (check via `Parsers.overflow(code)`) One additional convenience function is provided, `Parsers.quotednotescaped(code)`, which checks if a value was quoted, but didn't contain any escape characters, useful to indicate if a string may be used "as-is", instead of needing to be unescaped. """ const ReturnCode = Int16 const SUCCESS = 0b0000000000000000 % ReturnCode const INVALID = 0b1000000000000000 % ReturnCode # success flags const OK = 0b0000000000000001 % ReturnCode const SENTINEL = 0b0000000000000010 % ReturnCode # property flags const QUOTED = 0b0000000000000100 % ReturnCode const DELIMITED = 0b0000000000001000 % ReturnCode const NEWLINE = 0b0000000000010000 % ReturnCode const EOF = 0b0000000000100000 % ReturnCode const ESCAPED_STRING = 0b0000001000000000 % ReturnCode const SPECIAL_VALUE = 0b0000010000000000 % ReturnCode # invalid flags const INVALID_QUOTED_FIELD = 0b1000000001000000 % ReturnCode const INVALID_DELIMITER = 0b1000000010000000 % ReturnCode const OVERFLOW = 0b1000000100000000 % ReturnCode const INVALID_TOKEN = 0b1000010000000000 % ReturnCode const INEXACT = 0b1000100000000000 % ReturnCode valueok(x::ReturnCode) = (x & OK) == OK ok(x::ReturnCode) = (x & (OK | INVALID)) == OK invalid(x::ReturnCode) = x < SUCCESS sentinel(x::ReturnCode) = (x & SENTINEL) == SENTINEL quoted(x::ReturnCode) = (x & QUOTED) == QUOTED delimited(x::ReturnCode) = (x & DELIMITED) == DELIMITED newline(x::ReturnCode) = (x & NEWLINE) == NEWLINE escapedstring(x::ReturnCode) = (x & ESCAPED_STRING) == ESCAPED_STRING specialvalue(x::ReturnCode) = (x & SPECIAL_VALUE) == SPECIAL_VALUE invalidquotedfield(x::ReturnCode) = (x & INVALID_QUOTED_FIELD) == INVALID_QUOTED_FIELD invaliddelimiter(x::ReturnCode) = (x & INVALID_DELIMITER) == INVALID_DELIMITER overflow(x::ReturnCode) = (x & OVERFLOW) == OVERFLOW inexact(x::ReturnCode) = (x & INEXACT) == INEXACT quotednotescaped(x::ReturnCode) = (x & (QUOTED | ESCAPED_STRING)) == QUOTED invalidtoken(x::ReturnCode) = (x & INVALID_TOKEN) == INVALID_TOKEN eof(x::ReturnCode) = (x & EOF) == EOF memcmp(a::Ptr{UInt8}, b::Ptr{UInt8}, len::Int) = ccall(:memcmp, Cint, (Ptr{UInt8}, Ptr{UInt8}, Csize_t), a, b, len) == 0 struct RegexAndMatchData re::Regex data::Ptr{Cvoid} end function mkregex(re::Regex) Base.compile(re) return RegexAndMatchData(re, Base.PCRE.create_match_data(re.regex)) end const ByteStringRegex = Union{UInt8, String, RegexAndMatchData} struct Token token::ByteStringRegex end import Base: == function ==(a::Token, b::Token) t1 = a.token t2 = b.token if t1 isa UInt8 && t2 isa UInt8 return t1 == t2 elseif t1 isa String && t2 isa String return t1 == t2 elseif t1 isa RegexAndMatchData && t2 isa RegexAndMatchData return t1.re == t2.re else return false end end ==(a::Token, b::UInt8) = a.token isa UInt8 && a.token == b ==(a::UInt8, b::Token) = (b == a) # methods for `_contains(::Token, ::String)`: _contains(a::Token, str::String) = _contains(a.token, str) _contains(a::UInt8, str::String) = a == UInt8(str[1]) _contains(a::Char, str::String) = a == str[1] _contains(a::RegexAndMatchData, str::String) = contains(a.re.pattern, str) _contains(a::Token, char::Char) = _contains(a.token, char) _contains(a::UInt8, char::Char) = ncodeunits(char) == 1 && (Base.zext_int(UInt32, a) << 24) == Base.bitcast(UInt32, char) _contains(a::Char, char::Char) = a == char _contains(a::RegexAndMatchData, char::Char) = contains(a.re.pattern, char) _contains(a, b::UInt8) = _contains(a, Char(b)) _contains(a, b) = _contains(a, string(b)) _contains(a, b::Nothing) = false _contains(a::String, str::String) = contains(a, str) # methods for `_contains(::String, ::MaybeToken)`: _contains(a::String, b::UInt8) = _contains(a, Char(b)) _contains(a::String, b::Char) = _contains(a, string(b)) _contains(a::String, b::Regex) = contains(a, b.pattern) _contains(a::String, b::Nothing) = false function Base.isempty(x::Token) t = x.token return t isa String && isempty(t) end @noinline notsupported() = throw(ArgumentError("Regex matching not supported on this input type")) @inline function checktoken(source, pos, len, b, token::Token) tok = token.token if tok isa UInt8 check = tok == b check && incr!(source) return check, pos + check elseif tok isa String if source isa Vector{UInt8} # specialize common case return checktoken(source, pos, len, b, tok) else return checktoken(source, pos, len, b, tok) end elseif tok isa RegexAndMatchData if source isa Vector{UInt8} || source isa Base.CodeUnits{UInt8, String} || source isa AbstractVector{UInt8} return checktoken(source, pos, len, b, tok) else notsupported() end else error() # unreachable end end @inline function checktoken(source, pos, len, b, tok::UInt8) check = tok == b check && incr!(source) return check, pos + check end @inline function checktoken(source::AbstractVector{UInt8}, pos, len, b, tok::RegexAndMatchData) rc = ccall((:pcre2_match_8, Base.PCRE.PCRE_LIB), Cint, (Ptr{Cvoid}, Ptr{UInt8}, Csize_t, Csize_t, UInt32, Ptr{Cvoid}, Ptr{Cvoid}), tok.re.regex, source, len, pos - 1, tok.re.match_options, tok.data, Base.PCRE.get_local_match_context()) rc < -2 && error("PCRE.exec error: $(Base.PCRE.err_message(rc))") check = rc >= 0 return check, pos + (!check ? 0 : Base.PCRE.substring_length_bynumber(tok.data, 0)) end @inline function checktoken(source::AbstractVector{UInt8}, pos, len, b, tok::String) sz = sizeof(tok) check = (pos + sz - 1) <= len && memcmp(pointer(source, pos), pointer(tok), sz) return check, pos + (check * sz) end @inline function checktoken(source::IO, pos, len, b, tok::String) bytes = codeunits(tok) startpos = pos blen = length(bytes) for i = 1:blen @inbounds b2 = bytes[i] if b2 != b fastseek!(source, startpos - 1) return false, startpos end pos += 1 incr!(source) i == blen && break if eof(source, pos, len) fastseek!(source, startpos - 1) return false, startpos end b = peekbyte(source, pos) end return true, pos end function checktokens(source, pos, len, b, tokens::Union{Vector{String}, Vector{Token}}, consume=false) if source isa IO && !consume origpos = position(source) end for token in tokens check, pos = checktoken(source, pos, len, b, token) if check source isa IO && !consume && fastseek!(source, origpos) return true, pos end end source isa IO && !consume && fastseek!(source, origpos) return false, pos end function checkcmtemptylines(source, pos, len, cmt, ignoreemptylines) while !eof(source, pos, len) skipped = false if ignoreemptylines b = peekbyte(source, pos) if b == UInt8('\n') pos += 1 incr!(source) skipped = true elseif b == UInt8('\r') pos += 1 incr!(source) if !eof(source, pos, len) && peekbyte(source, pos) == UInt8('\n') pos += 1 incr!(source) end skipped = true end end matched = false if !isempty(cmt) && !eof(source, pos, len) b = peekbyte(source, pos) matched, pos = checktoken(source, pos, len, b, cmt) if matched eof(source, pos, len) && break b = peekbyte(source, pos) while true # consume the rest of the line/row until we hit the newline if b == UInt8('\n') pos += 1 incr!(source) break elseif b == UInt8('\r') pos += 1 incr!(source) if !eof(source, pos, len) && peekbyte(source, pos) == UInt8('\n') pos += 1 incr!(source) end break end pos += 1 incr!(source) eof(source, pos, len) && break b = peekbyte(source, pos) end end end (skipped | matched) || break end return pos end """ Parsers.fastseek!(io::IO, n::Integer) Without valididty checks, seek an `IO` to desired byte position `n`. Used in Parsers.jl to seek back to a previous location already parsed. """ function fastseek! end fastseek!(io::IO, n::Integer) = seek(io, n) function fastseek!(io::IOBuffer, n::Integer) io.ptr = n + 1 return end fastseek!(io::Union{AbstractVector{UInt8}, AbstractString}, n::Integer) = nothing """ Parsers.readbyte(io::IO)::UInt8 Consume a single byte from an `IO` without checking `eof(io)`. """ function readbyte end """ Parsers.peekbyte(io::IO)::UInt8 Return, but do not consume, the next byte from an `IO` without checking `eof(io)`. """ function peekbyte end incr!(io::IO) = readbyte(io) readbyte(from::IO) = Base.read(from, UInt8) peekbyte(from::IO) = UInt8(Base.peek(from)) function readbyte(from::IOBuffer) i = from.ptr @inbounds byte = from.data[i] from.ptr = i + 1 return byte end function peekbyte(from::IOBuffer) @inbounds byte = from.data[from.ptr] return byte end function incr!(from::IOBuffer) from.ptr += 1 return end incr!(::Union{AbstractVector{UInt8}, AbstractString}) = nothing peekbyte(from::IO, pos) = peekbyte(from) function peekbyte(from::AbstractVector{UInt8}, pos) @inbounds b = from[pos] return b end function peekbyte(from::AbstractString, pos) @inbounds b = codeunit(from, pos) return b end eof(::AbstractVector{UInt8}, pos::Integer, len::Integer) = pos > len eof(::AbstractString, pos::Integer, len::Integer) = pos > len eof(source::IO, pos::Integer, len::Integer) = Base.eof(source) eof(io::Base.GenericIOBuffer, pos::Integer, len::Integer) = (io.ptr - 1) >= io.size function text(r) str = "" if r & QUOTED > 0 str = "encountered an opening quote character, initial value parsing " else str = "initial value parsing " end if r & OK > 0 str *= "succeeded" else str *= "failed" end if r & (~INVALID & OVERFLOW) > 0 str *= ", value overflowed" end if r & (~INVALID & INEXACT) > 0 str *= ", value is inexactly represented" end if r & SENTINEL > 0 str *= ", a sentinel value was parsed" end if r & ESCAPED_STRING > 0 str *= ", encountered escape character" end if r & (~INVALID & INVALID_QUOTED_FIELD) > 0 str *= ", invalid quoted value" end if r & DELIMITED > 0 str *= ", a valid delimiter was parsed" end if r & NEWLINE > 0 str *= ", a newline was encountered" end if r & EOF > 0 str *= ", reached EOF" end if r & (~INVALID & INVALID_DELIMITER) > 0 str *= ", invalid delimiter" end return str end codes(r) = chop(chop(string( ifelse(r > 0, "SUCCESS: ", "INVALID: "), ifelse(r & OK > 0, "OK | ", ""), ifelse(r & SENTINEL > 0, "SENTINEL | ", ""), ifelse(r & QUOTED > 0, "QUOTED | ", ""), ifelse(r & ESCAPED_STRING > 0, "ESCAPED_STRING | ", ""), ifelse(r & DELIMITED > 0, "DELIMITED | ", ""), ifelse(r & NEWLINE > 0, "NEWLINE | ", ""), ifelse(r & EOF > 0, "EOF | ", ""), ifelse(r & (~INVALID & INVALID_QUOTED_FIELD) > 0, "INVALID_QUOTED_FIELD | ", ""), ifelse(r & (~INVALID & INVALID_DELIMITER) > 0, "INVALID_DELIMITER | ", ""), ifelse(r & (~INVALID & OVERFLOW) > 0, "OVERFLOW | ", ""), ifelse(r & (~INVALID & INEXACT) > 0, "INEXACT | ", ""), ))) """ PosLen(pos, len, ismissing, escaped) A custom 64-bit primitive that allows efficiently storing the byte position and length of a value within a byte array, along with whether a sentinel value was parsed, and whether the parsed value includes escaped characters. Specifically, the use of 64-bits is: * 1 bit to indicate whether a sentinel value was encountered while parsing * 1 bit to indicate whether the escape character was encountered while parsing * 42 bits to note the byte position as an integer where a value is located in a byte array (max array size ~4.4TB) * 20 bits to note the length of a parsed value (max length of ~1MB) These individual "fields" can be retrieved via dot access, like `PosLen.missingvalue`, `PosLen.escapedvalue`, `PosLen.pos`, `PosLen.len`. `Parsers.xparse(String, buf, pos, len, opts)` returns `Parsers.Result{PosLen}`, where the `x.val` is a `PosLen`. """ primitive type PosLen 64 end primitive type PosLen31 64 end const MISSING_BIT = Base.bitcast(Int64, 0x8000000000000000) const ESCAPE_BIT = Base.bitcast(Int64, 0x4000000000000000) _pos_shift(::Union{PosLen,Type{PosLen}}) = 20 _max_pos(::Union{PosLen,Type{PosLen}}) = 4398046511104 _max_len(::Union{PosLen,Type{PosLen}}) = 1048575 _pos_bits(::Union{PosLen,Type{PosLen}}) = Base.bitcast(Int64, 0x3ffffffffff00000) _len_bits(::Union{PosLen,Type{PosLen}}) = Base.bitcast(Int64, 0x00000000000fffff) _pos_shift(::Union{PosLen31,Type{PosLen31}}) = 31 _max_pos(::Union{PosLen31,Type{PosLen31}}) = Int64(2147483648) _max_len(::Union{PosLen31,Type{PosLen31}}) = Int64(2147483647) _pos_bits(::Union{PosLen31,Type{PosLen31}}) = Base.bitcast(Int64, 0x3fffffff80000000) _len_bits(::Union{PosLen31,Type{PosLen31}}) = Base.bitcast(Int64, 0x000000007fffffff) @noinline postoolarge(::Type{T}, pos) where {T} = throw(ArgumentError("position argument to $T ($pos) is too large; max position allowed is $(_max_pos(T))")) @noinline lentoolarge(::Type{T}, len) where {T} = throw(ArgumentError("length argument to $T ($len) is too large; max length allowed is $(_max_len(T))")) for T in (:PosLen, :PosLen31) @eval @inline function $T(pos::Integer, len::Integer, ismissing=false, escaped=false) pos > _max_pos($T) && postoolarge($T, pos) len > _max_len($T) && lentoolarge($T, len) @assert pos >= 0 @assert len >= 0 pos = Int64(pos) << _pos_shift($T) pos |= ifelse(ismissing, MISSING_BIT, 0) pos |= ifelse(escaped, ESCAPE_BIT, 0) return Base.bitcast($T, pos | Int64(len)) end @eval @noinline invalidproperty(::Type{$T}, nm) = throw(ArgumentError("invalid property $nm for $($T)")) @eval function Base.getproperty(x::$T, nm::Symbol) y = Base.bitcast(Int64, x) nm === :pos && return (y & _pos_bits($T)) >> _pos_shift($T) nm === :len && return y & _len_bits($T) nm === :missingvalue && return (y & MISSING_BIT) == MISSING_BIT nm === :escapedvalue && return (y & ESCAPE_BIT) == ESCAPE_BIT invalidproperty($T, nm) end @eval Base.propertynames(::$T) = (:pos, :len, :missingvalue, :escapedvalue) @eval Base.show(io::IO, x::$T) = print(io, "$($T)(pos=$(x.pos), len=$(x.len), missingvalue=$(x.missingvalue), escapedvalue=$(x.escapedvalue))") end poslen(pos::Integer, len::Integer) = poslen(PosLen, pos, len) poslen(::Type{T}, pos::Integer, len::Integer) where {T} = Base.bitcast(PosLen, (Int64(pos) << _pos_shift(PosLen)) | Int64(len)) poslen(::Type{PosLen31}, pos::Integer, len::Integer) = Base.bitcast(PosLen31, (Int64(pos) << _pos_shift(PosLen31)) | Int64(len)) withlen(pl::T, len::Integer) where {T<:Union{PosLen,PosLen31}} = Base.bitcast(T, (Base.bitcast(Int64, pl) & (~_max_len(T))) | Int64(len)) withmissing(pl::T) where {T<:Union{PosLen,PosLen31}} = Base.or_int(pl, Base.bitcast(T, MISSING_BIT)) withescaped(pl::T) where {T<:Union{PosLen,PosLen31}} = Base.or_int(pl, Base.bitcast(T, ESCAPE_BIT)) """ Parsers.getstring(buf_or_io, poslen::PosLen, e::UInt8) => String When calling `Parsers.xparse` with a `String` type argument, a `Parsers.Result{PosLen}` is returned, which has 3 fields: * `val`: a [`PosLen`](@ref) value which stores the starting byte position and length of the parsed string value * `code`: a parsing return code indicating success/failure * `tlen`: the total number of bytes parsed, which may differ from `val.len` if delimiters or open/close quotes were parsed If the actual parsed `String` _is_ needed, however, you can pass your source and the `res.val::PosLen` to `Parsers.getstring` to get the actual parsed `String` value. """ function getstring end _unsafe_string(p, len) = ccall(:jl_pchar_to_string, Ref{String}, (Ptr{UInt8}, Int), p, len) getstring(source::Union{IO, AbstractVector{UInt8}}, x::Union{PosLen,PosLen31}, e::Token) = getstring(source, x, e.token::UInt8) @inline function getstring(source::Union{IO, AbstractVector{UInt8}}, x::Union{PosLen,PosLen31}, e::UInt8) x.escapedvalue && return unescape(source, x, e) if source isa AbstractVector{UInt8} return _unsafe_string(pointer(source, x.pos), x.len) else pos = position(source) vpos, vlen = x.pos, x.len fastseek!(source, vpos - 1) str = Base.StringVector(vlen) readbytes!(source, str, vlen) fastseek!(source, pos) # reset IO to earlier position return String(str) end end getstring(str::AbstractString, pl::Union{PosLen,PosLen31}, e::UInt8) = getstring(codeunits(str), pl, e) # if a cell value of a csv file has escape characters, we need to unescape it @noinline function unescape(origbuf, x::Union{PosLen,PosLen31}, e) n = x.len if origbuf isa AbstractVector{UInt8} source = view(origbuf, x.pos:(x.pos + x.len - 1)) else origpos = position(origbuf) fastseek!(origbuf, x.pos - 1) source = origbuf end out = Base.StringVector(n) len = 1 i = 1 @inbounds begin while i <= n b = peekbyte(source, i) if b == e incr!(source) i += 1 b = peekbyte(source, i) end out[len] = b len += 1 incr!(source) i += 1 end end if origbuf isa IO fastseek!(origbuf, origpos) end resize!(out, len - 1) return String(out) end