module Tokenize

export tokenize, untokenize

using ..JuliaSyntax: JuliaSyntax, Kind, @K_str, @KSet_str

import ..JuliaSyntax: kind,
    is_literal, is_error, is_contextual_keyword, is_word_operator

#-------------------------------------------------------------------------------
# Character-based predicates for tokenization
import Base.Unicode

const EOF_CHAR = typemax(Char)

function is_identifier_char(c::Char)
    c == EOF_CHAR && return false
    isvalid(c) || return false
    return Base.is_id_char(c)
end

function is_identifier_start_char(c::Char)
    c == EOF_CHAR && return false
    isvalid(c) || return false
    return Base.is_id_start_char(c)
end

function is_invisible_char(c::Char)
    # These are the chars considered invisible by the reference parser.
    # TODO: There's others we could add? See for example
    # https://invisible-characters.com/
    return c == '\u00ad' || # soft hyphen
           c == '\u200b' || # zero width space
           c == '\u200c' || # zero width non-joiner
           c == '\u200d' || # zero width joiner
           c == '\u200e' || # left-to-right mark
           c == '\u200f' || # right-to-left mark
           c == '\u2060' || # word joiner
           c == '\u2061'    # function application
    # https://github.com/JuliaLang/julia/issues/49850
    # c == '\u115f' || # Hangul Choseong filler
end

# Chars that we will never allow to be part of a valid non-operator identifier
function is_never_id_char(ch::Char)
    isvalid(ch) || return true
    cat = Unicode.category_code(ch)
    c = UInt32(ch)
    return (
        # spaces and control characters:
        (cat >= Unicode.UTF8PROC_CATEGORY_ZS && cat <= Unicode.UTF8PROC_CATEGORY_CS) ||

        # ASCII and Latin1 non-connector punctuation
        (c < 0xff &&
         cat >= Unicode.UTF8PROC_CATEGORY_PD && cat <= Unicode.UTF8PROC_CATEGORY_PO) ||

        c == UInt32('`') ||

        # mathematical brackets
        (c >= 0x27e6 && c <= 0x27ef) ||
        # angle, corner, and lenticular brackets
        (c >= 0x3008 && c <= 0x3011) ||
        # tortoise shell, square, and more lenticular brackets
        (c >= 0x3014 && c <= 0x301b) ||
        # fullwidth parens
        (c == 0xff08 || c == 0xff09) ||
        # fullwidth square brackets
        (c == 0xff3b || c == 0xff3d)
    )
end

readchar(io::IO) = eof(io) ? EOF_CHAR : read(io, Char)

# Some unicode operators are normalized by the tokenizer into their equivalent
# kinds. See also normalize_identifier()
const _ops_with_unicode_aliases = [
    # \minus '−' is normalized into K"-",
    '−' => K"-"
    # Lookalikes which are normalized into K"⋅",
    # https://github.com/JuliaLang/julia/pull/25157,
    '\u00b7' => K"⋅" # '·' Middle Dot,,
    '\u0387' => K"⋅" # '·' Greek Ano Teleia,,
]

function _nondot_symbolic_operator_kinds()
    op_range = reinterpret(UInt16, K"BEGIN_OPS"):reinterpret(UInt16, K"END_OPS")
    setdiff(reinterpret.(Kind, op_range), [
        K"ErrorInvalidOperator"
        K"Error**"
        K"..."
        K"."
        K"where"
        K"isa"
        K"in"
        K".'"
    ])
end

function _char_in_set_expr(varname, firstchars)
    codes = sort!(UInt32.(unique(firstchars)))
    terms = []
    i = 1
    while i <= length(codes)
        j = i
        while j < length(codes) && codes[j+1] == codes[j]+1
            j += 1
        end
        if i == j
            push!(terms, :($varname == $(codes[i])))
        else
            push!(terms, :($(codes[i]) <= $varname <= $(codes[j])))
        end
        i = j+1
    end
    foldr((t1,t2)->:($t1 || $t2), terms)
end

@eval function is_operator_start_char(c)
   if c == EOF_CHAR || !isvalid(c)
       return false
   end
   u = UInt32(c)
   return $(_char_in_set_expr(:u,
       append!(first.(string.(_nondot_symbolic_operator_kinds())),
               first.(_ops_with_unicode_aliases))))
end

# Checks whether a Char is an operator which can be prefixed with a dot `.`
function is_dottable_operator_start_char(c)
    return c != '?' && c != '$' && c != ':' && c != '\'' && is_operator_start_char(c)
end

@eval function isopsuffix(c::Char)
    c == EOF_CHAR && return false
    isvalid(c) || return false
    u = UInt32(c)
    if (u < 0xa1 || u > 0x10ffff)
        return false
    end
    cat = Base.Unicode.category_code(u)
    if (cat == Base.Unicode.UTF8PROC_CATEGORY_MN ||
        cat == Base.Unicode.UTF8PROC_CATEGORY_MC ||
        cat == Base.Unicode.UTF8PROC_CATEGORY_ME)
        return true
    end
    # Additional allowed cases
    return $(_char_in_set_expr(:u,
        collect("²³¹ʰʲʳʷʸˡˢˣᴬᴮᴰᴱᴳᴴᴵᴶᴷᴸᴹᴺᴼᴾᴿᵀᵁᵂᵃᵇᵈᵉᵍᵏᵐᵒᵖᵗᵘᵛᵝᵞᵟᵠᵡᵢᵣᵤᵥᵦᵧᵨᵩᵪᶜᶠᶥᶦᶫᶰᶸᶻᶿ′″‴‵‶‷⁗⁰ⁱ⁴⁵⁶⁷⁸⁹⁺⁻⁼⁽⁾ⁿ₀₁₂₃₄₅₆₇₈₉₊₋₌₍₎ₐₑₒₓₕₖₗₘₙₚₛₜⱼⱽꜛꜜꜝ")))
end

function optakessuffix(k)
    (K"BEGIN_OPS" <= k <= K"END_OPS") &&
    !(
        k == K"..." ||
        K"BEGIN_ASSIGNMENTS" <= k <= K"END_ASSIGNMENTS" ||
        k == K"?"   ||
        k == K"<:"  ||
        k == K">:"  ||
        k == K"&&"  ||
        k == K"||"  ||
        k == K"in"  ||
        k == K"isa" ||
        k == K"≔"   ||
        k == K"⩴"   ||
        k == K":"   ||
        k == K".."  ||
        k == K"$"   ||
        k == K"::"  ||
        k == K"where" ||
        k == K"."   ||
        k == K"!"   ||
        k == K".'"  ||
        k == K"->"  ||
        K"¬" <= k <= K"∜"
    )
end

const _unicode_ops = let
    ks = _nondot_symbolic_operator_kinds()
    ss = string.(ks)

    ops = Dict{Char, Kind}([first(s)=>k for (k,s) in zip(ks,ss)
                            if length(s) == 1 && !isascii(s[1])])
    for ck in _ops_with_unicode_aliases
        push!(ops, ck)
    end
    ops
end

#-------------------------------------------------------------------------------
# Tokens

struct RawToken
    kind::Kind
    # Offsets into a string or buffer
    startbyte::Int # The byte where the token start in the buffer
    endbyte::Int # The byte where the token ended in the buffer
    dotop::Bool
    suffix::Bool
end
function RawToken(kind::Kind, startbyte::Int, endbyte::Int)
    RawToken(kind, startbyte, endbyte, false, false)
end
RawToken() = RawToken(K"error", 0, 0, false, false)

const EMPTY_TOKEN = RawToken()

kind(t::RawToken) = t.kind

startbyte(t::RawToken) = t.startbyte
endbyte(t::RawToken) = t.endbyte


function untokenize(t::RawToken, str::String)
    String(codeunits(str)[1 .+ (t.startbyte:t.endbyte)])
end

function Base.show(io::IO, t::RawToken)
    print(io, rpad(string(startbyte(t), "-", endbyte(t)), 11, " "))
    print(io, rpad(kind(t), 15, " "))
end

#-------------------------------------------------------------------------------
# Lexer

@inline ishex(c::Char) = isdigit(c) || ('a' <= c <= 'f') || ('A' <= c <= 'F')
@inline isbinary(c::Char) = c == '0' || c == '1'
@inline isoctal(c::Char) =  '0' ≤ c ≤ '7'
@inline iswhitespace(c::Char) = (isvalid(c) && Base.isspace(c)) || c === '\ufeff'

struct StringState
    triplestr::Bool
    raw::Bool
    delim::Char
    paren_depth::Int
end

"""
`Lexer` reads from an input stream and emits a single token each time
`next_token` is called.

Ideally a lexer is stateless but some state is needed here for:
* Disambiguating cases like x' (adjoint) vs 'x' (character literal)
* Tokenizing code within string interpolations
"""
mutable struct Lexer{IO_t <: IO}
    io::IO_t

    token_startpos::Int

    last_token::Kind
    string_states::Vector{StringState}
    chars::Tuple{Char,Char,Char,Char}
    charspos::Tuple{Int,Int,Int,Int}
    dotop::Bool
end

function Lexer(io::IO)
    c1 = ' '
    p1 = position(io)
    if eof(io)
        c2, p2 = EOF_CHAR, p1
        c3, p3 = EOF_CHAR, p1
        c4, p4 = EOF_CHAR, p1
    else
        c2 = read(io, Char)
        p2 = position(io)
        if eof(io)
            c3, p3 = EOF_CHAR, p2
            c4, p4 = EOF_CHAR, p2
        else
            c3 = read(io, Char)
            p3 = position(io)
            if eof(io)
                c4, p4 = EOF_CHAR, p3
            else
                c4 = read(io, Char)
                p4 = position(io)
            end
        end
    end
    Lexer(io, position(io),
                  K"error", Vector{StringState}(),
                  (c1,c2,c3,c4), (p1,p2,p3,p4), false)
end
Lexer(str::AbstractString) = Lexer(IOBuffer(str))

"""
    tokenize(x)

Returns an `Iterable` containing the tokenized input. Can be reverted by e.g.
`join(untokenize.(tokenize(x)))`.
"""
tokenize(x) = Lexer(x)

# Iterator interface
Base.IteratorSize(::Type{<:Lexer}) = Base.SizeUnknown()
Base.IteratorEltype(::Type{<:Lexer}) = Base.HasEltype()
Base.eltype(::Type{<:Lexer}) = RawToken


function Base.iterate(l::Lexer)
    l.token_startpos = position(l)

    t = next_token(l)
    return t, t.kind == K"EndMarker"
end

function Base.iterate(l::Lexer, isdone::Any)
    isdone && return nothing
    t = next_token(l)
    return t, t.kind == K"EndMarker"
end

function Base.show(io::IO, l::Lexer)
    print(io, typeof(l), " at position: ", position(l))
end

"""
    startpos(l::Lexer)

Return the latest `RawToken`'s starting position.
"""
startpos(l::Lexer) = l.token_startpos

"""
    startpos!(l::Lexer, i::Integer)

Set a new starting position.
"""
startpos!(l::Lexer, i::Integer) = l.token_startpos = i

"""
    peekchar(l::Lexer)

Returns the next character without changing the lexer's state.
"""
peekchar(l::Lexer) = l.chars[2]

"""
dpeekchar(l::Lexer)

Returns the next two characters without changing the lexer's state.
"""
dpeekchar(l::Lexer) = l.chars[2], l.chars[3]

"""
peekchar3(l::Lexer)

Returns the next three characters without changing the lexer's state.
"""
peekchar3(l::Lexer) = l.chars[2], l.chars[3], l.chars[4]

"""
    position(l::Lexer)

Returns the current position.
"""
Base.position(l::Lexer) = l.charspos[1]

"""
    eof(l::Lexer)

Determine whether the end of the lexer's underlying buffer has been reached.
"""
Base.eof(l::Lexer) = eof(l.io)

Base.seek(l::Lexer, pos) = seek(l.io, pos)

"""
    start_token!(l::Lexer)

Updates the lexer's state such that the next  `RawToken` will start at the current
position.
"""
function start_token!(l::Lexer)
    l.token_startpos = l.charspos[1]
end

"""
    readchar(l::Lexer)

Returns the next character and increments the current position.
"""
function readchar(l::Lexer)
    c = readchar(l.io)
    l.chars = (l.chars[2], l.chars[3], l.chars[4], c)
    l.charspos = (l.charspos[2], l.charspos[3], l.charspos[4], position(l.io))
    return l.chars[1]
end

"""
    accept(l::Lexer, f::Union{Function, Char, Vector{Char}, String})

Consumes the next character `c` if either `f::Function(c)` returns true, `c == f`
for `c::Char` or `c in f` otherwise. Returns `true` if a character has been
consumed and `false` otherwise.
"""
@inline function accept(l::Lexer, f::Union{Function, Char, Vector{Char}, String})
    c = peekchar(l)
    if isa(f, Function)
        ok = f(c)
    elseif isa(f, Char)
        ok = c == f
    else
        ok = c in f
    end
    ok && readchar(l)
    return ok
end

"""
    accept_batch(l::Lexer, f)

Consumes all following characters until `accept(l, f)` is `false`.
"""
@inline function accept_batch(l::Lexer, f)
    ok = false
    while accept(l, f)
        ok = true
    end
    return ok
end

"""
    emit(l::Lexer, kind::Kind)

Returns a `RawToken` of kind `kind` with contents `str` and starts a new `RawToken`.
"""
function emit(l::Lexer, kind::Kind, maybe_op=true)
    suffix = false
    if optakessuffix(kind) && maybe_op
        while isopsuffix(peekchar(l))
            readchar(l)
            suffix = true
        end
    end

    tok = RawToken(kind, startpos(l), position(l) - 1, l.dotop, suffix)

    l.dotop = false
    l.last_token = kind
    return tok
end

"""
    next_token(l::Lexer)

Returns the next `RawToken`.
"""
function next_token(l::Lexer, start = true)
    start && start_token!(l)
    if !isempty(l.string_states)
        lex_string_chunk(l)
    else
        _next_token(l, readchar(l))
    end
end

function _next_token(l::Lexer, c)
    if c == EOF_CHAR
        return emit(l, K"EndMarker")
    elseif iswhitespace(c)
        return lex_whitespace(l, c)
    elseif c == '['
        return emit(l, K"[")
    elseif c == ']'
        return emit(l, K"]")
    elseif c == '{'
        return emit(l, K"{")
    elseif c == ';'
        return emit(l, K";")
    elseif c == '}'
        return emit(l, K"}")
    elseif c == '('
        return emit(l, K"(")
    elseif c == ')'
        return emit(l, K")")
    elseif c == ','
        return emit(l, K",")
    elseif c == '*'
        return lex_star(l);
    elseif c == '^'
        return lex_circumflex(l);
    elseif c == '@'
        return emit(l, K"@")
    elseif c == '?'
        return emit(l, K"?")
    elseif c == '$'
        return lex_dollar(l);
    elseif c == '⊻'
        return lex_xor(l);
    elseif c == '~'
        return emit(l, K"~")
    elseif c == '#'
        return lex_comment(l)
    elseif c == '='
        return lex_equal(l)
    elseif c == '!'
        return lex_exclaim(l)
    elseif c == '>'
        return lex_greater(l)
    elseif c == '<'
        return lex_less(l)
    elseif c == ':'
        return lex_colon(l)
    elseif c == '|'
        return lex_bar(l)
    elseif c == '&'
        return lex_amper(l)
    elseif c == '\''
        return lex_prime(l)
    elseif c == '÷'
        return lex_division(l)
    elseif c == '"'
        return lex_quote(l);
    elseif c == '%'
        return lex_percent(l);
    elseif c == '/'
        return lex_forwardslash(l);
    elseif c == '\\'
        return lex_backslash(l);
    elseif c == '.'
        return lex_dot(l);
    elseif c == '+'
        return lex_plus(l);
    elseif c == '-'
        return lex_minus(l);
    elseif c == '−' # \minus '−' treated as hyphen '-'
        return emit(l, accept(l, '=') ? K"-=" : K"-")
    elseif c == '`'
        return lex_backtick(l);
    elseif is_identifier_start_char(c)
        return lex_identifier(l, c)
    elseif isdigit(c)
        return lex_digit(l, K"Integer")
    elseif (k = get(_unicode_ops, c, K"error")) != K"error"
        return emit(l, k)
    else
        emit(l,
            !isvalid(c)          ? K"ErrorInvalidUTF8"   :
            is_invisible_char(c) ? K"ErrorInvisibleChar" :
            K"ErrorUnknownCharacter")
    end
end

# UAX #9: Unicode Bidirectional Algorithm
# https://unicode.org/reports/tr9/
# Very partial implementation - just enough to check correct nesting in strings
# and multiline comments.
function update_bidi_state((embedding_nesting, isolate_nesting), c)
    if c == '\n'
        embedding_nesting = 0
        isolate_nesting = 0
    elseif c == '\U202A' || c == '\U202B' || c == '\U202D' || c == '\U202E' # LRE RLE LRO RLO
        embedding_nesting += 1
    elseif c == '\U202C' # PDF
        embedding_nesting -= 1
    elseif c == '\U2066' || c == '\U2067' || c == '\U2068' # LRI RLI FSI
        isolate_nesting += 1
    elseif c == '\U2069' # PDI
        isolate_nesting -= 1
    end
    return (embedding_nesting, isolate_nesting)
end

# We're inside a string; possibly reading the string characters, or maybe in
# Julia code within an interpolation.
function lex_string_chunk(l)
    state = last(l.string_states)
    if state.paren_depth > 0
        # Read normal Julia code inside an interpolation but track nesting of
        # parentheses.
        # TODO: This stateful tracking should probably, somehow, be done by the
        # parser instead? Especially for recovery of unbalanced parens inside
        # interpolations?
        c = readchar(l)
        if c == '('
            l.string_states[end] = StringState(state.triplestr, state.raw, state.delim,
                                               state.paren_depth + 1)
            return emit(l, K"(")
        elseif c == ')'
            l.string_states[end] = StringState(state.triplestr, state.raw, state.delim,
                                               state.paren_depth - 1)
            return emit(l, K")")
        else
            return _next_token(l, c)
        end
    end
    pc = peekchar(l)
    if l.last_token == K"$"
        pc = peekchar(l)
        # Interpolated symbol or expression
        if pc == '('
            readchar(l)
            l.string_states[end] = StringState(state.triplestr, state.raw, state.delim,
                                               state.paren_depth + 1)
            return emit(l, K"(")
        elseif is_identifier_start_char(pc)
            return lex_identifier(l, readchar(l))
        else
            # Getting here is a syntax error - fall through to reading string
            # characters and let the parser deal with it.
        end
    elseif l.last_token == K"Identifier" &&
            !(pc == EOF_CHAR || is_operator_start_char(pc) || is_never_id_char(pc))
        # Only allow certain characters after interpolated vars
        # https://github.com/JuliaLang/julia/pull/25234
        readchar(l)
        return emit(l, K"ErrorInvalidInterpolationTerminator")
    end
    if pc == EOF_CHAR
        return emit(l, K"EndMarker")
    elseif !state.raw && pc == '$'
        # Start interpolation
        readchar(l)
        return emit(l, K"$")
    elseif !state.raw && pc == '\\' && (pc2 = dpeekchar(l)[2];
                                        pc2 == '\r' || pc2 == '\n')
        # Process escaped newline as whitespace
        readchar(l)
        readchar(l)
        if pc2 == '\r' && peekchar(l) == '\n'
            readchar(l)
        end
        while (pc = peekchar(l); pc == ' ' || pc == '\t')
            readchar(l)
        end
        return emit(l, K"Whitespace")
    elseif pc == state.delim && string_terminates(l, state.delim, state.triplestr)
        if state.delim == '\'' && l.last_token == K"'" && dpeekchar(l)[2] == '\''
            # Handle '''
            readchar(l)
            return emit(l, K"Char")
        end
        # Terminate string
        pop!(l.string_states)
        readchar(l)
        if state.triplestr
            readchar(l); readchar(l)
            return emit(l, state.delim == '"' ?
                        K"\"\"\"" : K"```")
        else
            return emit(l, state.delim == '"' ? K"\"" :
                           state.delim == '`' ? K"`"  : K"'", false)
        end
    end
    # Read a chunk of string characters
    init_bidi_state = (0,0)
    bidi_state = init_bidi_state
    valid = true
    if state.raw
        # Raw strings treat all characters as literals with the exception that
        # the closing quotes can be escaped with an odd number of \ characters.
        while true
            pc = peekchar(l)
            if string_terminates(l, state.delim, state.triplestr) || pc == EOF_CHAR
                break
            elseif state.triplestr && (pc == '\n' || pc == '\r')
                # triple quoted newline splitting
                readchar(l)
                if pc == '\n'
                    bidi_state = init_bidi_state
                elseif pc == '\r' && peekchar(l) == '\n'
                    bidi_state = init_bidi_state
                    readchar(l)
                end
                break
            end
            c = readchar(l)
            if c == '\\'
                n = 1
                while peekchar(l) == '\\'
                    readchar(l)
                    n += 1
                end
                if peekchar(l) == state.delim && !iseven(n)
                    readchar(l)
                end
            end
            bidi_state = update_bidi_state(bidi_state, c)
            valid &= isvalid(c)
        end
    else
        while true
            pc = peekchar(l)
            if pc == '$' || pc == EOF_CHAR
                break
            elseif state.triplestr && (pc == '\n' || pc == '\r')
                # triple quoted newline splitting
                readchar(l)
                if pc == '\n'
                    bidi_state = init_bidi_state
                elseif pc == '\r' && peekchar(l) == '\n'
                    readchar(l)
                    bidi_state = init_bidi_state
                end
                break
            elseif pc == state.delim && string_terminates(l, state.delim, state.triplestr)
                break
            elseif pc == '\\'
                # Escaped newline
                _, pc2, pc3 = peekchar3(l)
                if pc2 == '\r' || pc2 == '\n'
                    if pc2 == '\n' || pc3 == '\n'
                        bidi_state = init_bidi_state
                    end
                    break
                end
            end
            c = readchar(l)
            if c == '\\'
                c = readchar(l)
                c == EOF_CHAR && break
            end
            bidi_state = update_bidi_state(bidi_state, c)
            valid &= isvalid(c)
        end
    end
    outk = !valid                        ? K"ErrorInvalidUTF8"    :
           state.delim == '\''           ? K"Char"                :
           bidi_state != init_bidi_state ? K"ErrorBidiFormatting" :
           state.delim == '"'            ? K"String"              :
           state.delim == '`'            ? K"CmdString"           :
           (@assert(state.delim in KSet"' \" `"); K"error")
    return emit(l, outk)
end

# Lex whitespace, a whitespace char `c` has been consumed
function lex_whitespace(l::Lexer, c)
    k = K"Whitespace"
    while true
        if c == '\n'
            k = K"NewlineWs"
        end
        pc = peekchar(l)
        # stop on non whitespace and limit to a single newline in a token
        if !iswhitespace(pc) || (k == K"NewlineWs" && pc == '\n')
            break
        end
        c = readchar(l)
    end
    return emit(l, k)
end

function lex_comment(l::Lexer)
    if peekchar(l) != '='
        valid = true
        while true
            pc = peekchar(l)
            if pc == '\n' || pc == EOF_CHAR
                return emit(l, valid ? K"Comment" : K"ErrorInvalidUTF8")
            end
            valid &= isvalid(pc)
            readchar(l)
        end
    else
        c = readchar(l) # consume the '='
        init_bidi_state = (0,0)
        bidi_state = init_bidi_state
        skip = true  # true => c was part of the prev comment marker pair
        nesting = 1
        valid = true
        while true
            if c == EOF_CHAR
                return emit(l, K"ErrorEofMultiComment")
            end
            nc = readchar(l)
            bidi_state = update_bidi_state(bidi_state, nc)
            valid &= isvalid(nc)
            if skip
                skip = false
            else
                if c == '#' && nc == '='
                    nesting += 1
                    skip = true
                elseif c == '=' && nc == '#'
                    nesting -= 1
                    skip = true
                    if nesting == 0
                        outk = !valid ? K"ErrorInvalidUTF8" :
                               bidi_state != init_bidi_state ? K"ErrorBidiFormatting" :
                               K"Comment"
                        return emit(l, outk)
                    end
                end
            end
            c = nc
        end
    end
end

# Lex a greater char, a '>' has been consumed
function lex_greater(l::Lexer)
    if accept(l, '>')
        if accept(l, '>')
            if accept(l, '=')
                return emit(l, K">>>=")
            else # >>>?, ? not a =
                return emit(l, K">>>")
            end
        elseif accept(l, '=')
            return emit(l, K">>=")
        else
            return emit(l, K">>")
        end
    elseif accept(l, '=')
        return emit(l, K">=")
    elseif accept(l, ':')
        return emit(l, K">:")
    else
        return emit(l, K">")
    end
end

# Lex a less char, a '<' has been consumed
function lex_less(l::Lexer)
    if accept(l, '<')
        if accept(l, '=')
            return emit(l, K"<<=")
        else # '<<?', ? not =, ' '
            return emit(l, K"<<")
        end
    elseif accept(l, '=')
        return emit(l, K"<=")
    elseif accept(l, ':')
        return emit(l, K"<:")
    elseif accept(l, '|')
        return emit(l, K"<|")
    elseif dpeekchar(l) == ('-', '-')
        readchar(l); readchar(l)
        if accept(l, '-')
            return emit(l, K"ErrorInvalidOperator")
        else
            if accept(l, '>')
                return emit(l, K"<-->")
            elseif accept(l, '-')
                return emit(l, K"ErrorInvalidOperator")
            else
                return emit(l, K"<--")
            end
        end
    else
        return emit(l, K"<")
    end
end

# Lex all tokens that start with an = character.
# An '=' char has been consumed
function lex_equal(l::Lexer)
    if accept(l, '=')
        if accept(l, '=')
            emit(l, K"===")
        else
            emit(l, K"==")
        end
    elseif accept(l, '>')
        emit(l, K"=>")
    else
        emit(l, K"=")
    end
end

# Lex a colon, a ':' has been consumed
function lex_colon(l::Lexer)
    if accept(l, ':')
        return emit(l, K"::")
    elseif accept(l, '=')
        return emit(l, K":=")
    else
        return emit(l, K":")
    end
end

function lex_exclaim(l::Lexer)
    if accept(l, '=')
        if accept(l, '=')
            return emit(l, K"!==")
        else
            return emit(l, K"!=")
        end
    else
        return emit(l, K"!")
    end
end

function lex_percent(l::Lexer)
    if accept(l, '=')
        return emit(l, K"%=")
    else
        return emit(l, K"%")
    end
end

function lex_bar(l::Lexer)
    if accept(l, '=')
        return emit(l, K"|=")
    elseif accept(l, '>')
        return emit(l, K"|>")
    elseif accept(l, '|')
        return emit(l, K"||")
    else
        emit(l, K"|")
    end
end

function lex_plus(l::Lexer)
    if accept(l, '+')
        return emit(l, K"++")
    elseif accept(l, '=')
        return emit(l, K"+=")
    end
    return emit(l, K"+")
end

function lex_minus(l::Lexer)
    if accept(l, '-')
        if accept(l, '>')
            return emit(l, K"-->")
        else
            return emit(l, K"ErrorInvalidOperator") # "--" is an invalid operator
        end
    elseif !l.dotop && accept(l, '>')
        return emit(l, K"->")
    elseif accept(l, '=')
        return emit(l, K"-=")
    end
    return emit(l, K"-")
end

function lex_star(l::Lexer)
    if accept(l, '*')
        return emit(l, K"Error**") # "**" is an invalid operator use ^
    elseif accept(l, '=')
        return emit(l, K"*=")
    end
    return emit(l, K"*")
end

function lex_circumflex(l::Lexer)
    if accept(l, '=')
        return emit(l, K"^=")
    end
    return emit(l, K"^")
end

function lex_division(l::Lexer)
    if accept(l, '=')
        return emit(l, K"÷=")
    end
    return emit(l, K"÷")
end

function lex_dollar(l::Lexer)
    if accept(l, '=')
        return emit(l, K"$=")
    end
    return emit(l, K"$")
end

function lex_xor(l::Lexer)
    if accept(l, '=')
        return emit(l, K"⊻=")
    end
    return emit(l, K"⊻")
end

function accept_number(l::Lexer, f::F) where {F}
    lexed_number = false
    while true
        pc, ppc = dpeekchar(l)
        if pc == '_' && !f(ppc)
            return lexed_number
        elseif f(pc) || pc == '_'
            readchar(l)
        else
            return lexed_number
        end
        lexed_number = true
    end
end

# A digit has been consumed
function lex_digit(l::Lexer, kind)
    accept_number(l, isdigit)
    pc,ppc = dpeekchar(l)
    if pc == '.'
        if ppc == '.'
            # Number followed by K".." or K"..."
            return emit(l, kind)
        elseif kind === K"Float"
            # If we enter the function with kind == K"Float" then a '.' has been parsed.
            readchar(l)
            return emit(l, K"ErrorInvalidNumericConstant")
        elseif is_dottable_operator_start_char(ppc)
            readchar(l)
            return emit(l, K"ErrorAmbiguousNumericConstant") # `1.+`
        end
        readchar(l)

        kind = K"Float"
        accept(l, '_') && return emit(l, K"ErrorInvalidNumericConstant") # `1._`
        had_fraction_digs = accept_number(l, isdigit)
        pc, ppc = dpeekchar(l)
        if (pc == 'e' || pc == 'E' || pc == 'f') && (isdigit(ppc) || ppc == '+' || ppc == '-' || ppc == '−')
            kind = pc == 'f' ? K"Float32" : K"Float"
            readchar(l)
            accept(l, "+-−")
            if accept_batch(l, isdigit)
                pc,ppc = dpeekchar(l)
                if pc === '.' && !is_dottable_operator_start_char(ppc)
                    readchar(l)
                    return emit(l, K"ErrorInvalidNumericConstant") # `1.e1.`
                end
            else
                return emit(l, K"ErrorInvalidNumericConstant") # `1.e`
            end
        elseif pc == '.' && ppc != '.' && !is_dottable_operator_start_char(ppc)
            readchar(l)
            return emit(l, K"ErrorInvalidNumericConstant") # `1.1.`
        elseif !had_fraction_digs && (is_identifier_start_char(pc) ||
                                      pc == '(' || pc == '[' || pc == '{' ||
                                      pc == '@' || pc == '`' || pc == '"')
            return emit(l, K"ErrorAmbiguousNumericDotMultiply") # `1.(` `1.x`
        end
    elseif (pc == 'e' || pc == 'E' || pc == 'f') && (isdigit(ppc) || ppc == '+' || ppc == '-' || ppc == '−')
        kind = pc == 'f' ? K"Float32" : K"Float"
        readchar(l)
        accept(l, "+-−")
        if accept_batch(l, isdigit)
            pc,ppc = dpeekchar(l)
            if pc === '.' && !is_dottable_operator_start_char(ppc)
                accept(l, '.')
                return emit(l, K"ErrorInvalidNumericConstant") # `1e1.`
            end
        else
            return emit(l, K"ErrorInvalidNumericConstant") # `1e+`
        end
    elseif position(l) - startpos(l) == 1 && l.chars[1] == '0'
        kind == K"Integer"
        is_bin_oct_hex_int = false
        if pc == 'x'
            kind = K"HexInt"
            isfloat = false
            readchar(l)
            had_digits = accept_number(l, ishex)
            pc,ppc = dpeekchar(l)
            if pc == '.' && ppc != '.'
                readchar(l)
                had_digits |= accept_number(l, ishex)
                isfloat = true
            end
            if accept(l, "pP")
                kind = K"Float"
                accept(l, "+-−")
                if !accept_number(l, isdigit) || !had_digits
                    return emit(l, K"ErrorInvalidNumericConstant") # `0x1p` `0x.p0`
                end
            elseif isfloat
                return emit(l, K"ErrorHexFloatMustContainP") # `0x.` `0x1.0`
            end
            is_bin_oct_hex_int = !isfloat
        elseif pc == 'b'
            readchar(l)
            had_digits = accept_number(l, isbinary)
            kind = K"BinInt"
            is_bin_oct_hex_int = true
        elseif pc == 'o'
            readchar(l)
            had_digits = accept_number(l, isoctal)
            kind = K"OctInt"
            is_bin_oct_hex_int = true
        end
        if is_bin_oct_hex_int
            pc = peekchar(l)
            if !had_digits || isdigit(pc) || is_identifier_start_char(pc)
                accept_batch(l, c->isdigit(c) || is_identifier_start_char(c))
                # `0x` `0xg` `0x_` `0x-`
                # `0b123` `0o78p` `0xenomorph` `0xaα`
                return emit(l, K"ErrorInvalidNumericConstant")
            end
        end
    end
    return emit(l, kind)
end

function lex_prime(l)
    if l.last_token == K"Identifier"         ||
         is_contextual_keyword(l.last_token) ||
         is_word_operator(l.last_token)      ||
         l.last_token == K"."                ||
         l.last_token ==  K")"               ||
         l.last_token ==  K"]"               ||
         l.last_token ==  K"}"               ||
         l.last_token == K"'"                ||
         l.last_token == K"end"              ||
         is_literal(l.last_token)
        # FIXME ^ This doesn't cover all cases - probably needs involvement
        # from the parser state.
        return emit(l, K"'")
    else
        push!(l.string_states, StringState(false, true, '\'', 0))
        return emit(l, K"'", false)
    end
end

function lex_amper(l::Lexer)
    if accept(l, '&')
        return emit(l, K"&&")
    elseif accept(l, '=')
        return emit(l, K"&=")
    else
        return emit(l, K"&")
    end
end

# Parse a token starting with a quote.
# A '"' has been consumed
function lex_quote(l::Lexer)
    raw = l.last_token == K"Identifier" ||
          is_contextual_keyword(l.last_token) ||
          is_word_operator(l.last_token)
    pc, dpc = dpeekchar(l)
    triplestr = pc == '"' && dpc == '"'
    push!(l.string_states, StringState(triplestr, raw, '"', 0))
    if triplestr
        readchar(l)
        readchar(l)
        emit(l, K"\"\"\"")
    else
        emit(l, K"\"")
    end
end

function string_terminates(l, delim::Char, triplestr::Bool)
    if triplestr
        c1, c2, c3 = peekchar3(l)
        c1 === delim && c2 === delim && c3 === delim
    else
        peekchar(l) === delim
    end
end

# Parse a token starting with a forward slash.
# A '/' has been consumed
function lex_forwardslash(l::Lexer)
    if accept(l, '/')
        if accept(l, '=')
            return emit(l, K"//=")
        else
            return emit(l, K"//")
        end
    elseif accept(l, '=')
        return emit(l, K"/=")
    else
        return emit(l, K"/")
    end
end

function lex_backslash(l::Lexer)
    if accept(l, '=')
        return emit(l, K"\=")
    end
    return emit(l, K"\\")
end

function lex_dot(l::Lexer)
    if accept(l, '.')
        if accept(l, '.')
            return emit(l, K"...")
        else
            if is_dottable_operator_start_char(peekchar(l))
                readchar(l)
                return emit(l, K"ErrorInvalidOperator")
            else
                return emit(l, K"..")
            end
        end
    elseif Base.isdigit(peekchar(l))
        return lex_digit(l, K"Float")
    else
        pc, dpc = dpeekchar(l)
        if pc == '+'
            l.dotop = true
            readchar(l)
            return lex_plus(l)
        elseif pc =='-'
            l.dotop = true
            readchar(l)
            return lex_minus(l)
        elseif pc == '−'
            l.dotop = true
            readchar(l)
            return emit(l, accept(l, '=') ? K"-=" : K"-")
        elseif pc =='*'
            l.dotop = true
            readchar(l)
            return lex_star(l)
        elseif pc =='/'
            l.dotop = true
            readchar(l)
            return lex_forwardslash(l)
        elseif pc =='\\'
            l.dotop = true
            readchar(l)
            return lex_backslash(l)
        elseif pc =='^'
            l.dotop = true
            readchar(l)
            return lex_circumflex(l)
        elseif pc =='<'
            l.dotop = true
            readchar(l)
            return lex_less(l)
        elseif pc =='>'
            l.dotop = true
            readchar(l)
            return lex_greater(l)
        elseif pc =='&'
            l.dotop = true
            readchar(l)
            if accept(l, '=')
                return emit(l, K"&=")
            else
                if accept(l, '&')
                    return emit(l, K"&&")
                end
                return emit(l, K"&")
            end
        elseif pc =='%'
            l.dotop = true
            readchar(l)
            return lex_percent(l)
        elseif pc == '=' && dpc != '>'
            l.dotop = true
            readchar(l)
            return lex_equal(l)
        elseif pc == '|'
            l.dotop = true
            readchar(l)
            if accept(l, '|')
                return emit(l, K"||")
            end
            return lex_bar(l)
        elseif pc == '!' && dpc == '='
            l.dotop = true
            readchar(l)
            return lex_exclaim(l)
        elseif pc == '⊻'
            l.dotop = true
            readchar(l)
            return lex_xor(l)
        elseif pc == '÷'
            l.dotop = true
            readchar(l)
            return lex_division(l)
        elseif pc == '=' && dpc == '>'
            l.dotop = true
            readchar(l)
            return lex_equal(l)
        elseif is_dottable_operator_start_char(pc)
            l.dotop = true
            return _next_token(l, readchar(l))
        end
        return emit(l, K".")
    end
end

# A ` has been consumed
function lex_backtick(l::Lexer)
    pc, dpc = dpeekchar(l)
    triplestr = pc == '`' && dpc == '`'
    # Backticks always contain raw strings only. See discussion on bug
    # https://github.com/JuliaLang/julia/issues/3150
    raw = true
    push!(l.string_states, StringState(triplestr, raw, '`', 0))
    if triplestr
        readchar(l)
        readchar(l)
        emit(l, K"```")
    else
        emit(l, K"`")
    end
end

const MAX_KW_LENGTH = 10
function lex_identifier(l::Lexer, c)
    h = simple_hash(c, UInt64(0))
    n = 1
    while true
        pc, ppc = dpeekchar(l)
        if (pc == '!' && ppc == '=') || !is_identifier_char(pc)
            break
        end
        c = readchar(l)
        h = simple_hash(c, h)
        n += 1
    end

    if n > MAX_KW_LENGTH
        emit(l, K"Identifier")
    else
        emit(l, get(kw_hash, h, K"Identifier"))
    end
end

# This creates a hash for chars in [a-z] using 5 bit per char.
# Requires an additional input-length check somewhere, because
# this only works up to ~12 chars.
@inline function simple_hash(c::Char, h::UInt64)
    bytehash = (clamp(c - 'a' + 1, -1, 30) % UInt8) & 0x1f
    h << 5 + bytehash
end

function simple_hash(str)
    ind = 1
    h = UInt64(0)
    while ind <= length(str)
        h = simple_hash(str[ind], h)
        ind = nextind(str, ind)
    end
    h
end

kws = [
K"baremodule",
K"begin",
K"break",
K"catch",
K"const",
K"continue",
K"do",
K"else",
K"elseif",
K"end",
K"export",
K"finally",
K"for",
K"function",
K"global",
K"if",
K"import",
K"let",
K"local",
K"macro",
K"module",
K"public",
K"quote",
K"return",
K"struct",
K"try",
K"using",
K"while",
K"in",
K"isa",
K"where",
K"true",
K"false",

K"abstract",
K"as",
K"doc",
K"mutable",
K"outer",
K"primitive",
K"type",
K"var",
]

const kw_hash = Dict(simple_hash(lowercase(string(kw))) => kw for kw in kws)

end # module