open-axiom repository from github
// -*- C++ -*-
// Copyright (C) 2013-2014, Gabriel Dos Reis.
// All rights reserved.
// Written by Gabriel Dos Reis.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// - Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// - Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in
// the documentation and/or other materials provided with the
// distribution.
//
// - Neither the name of OpenAxiom. nor the names of its contributors
// may be used to endorse or promote products derived from this
// software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
// OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifndef OPENAXIOM_TOKEN_included
#define OPENAXIOM_TOKEN_included
#include <stdint.h>
#include <stack>
#include <iosfwd>
#include <open-axiom/Input>
#include <open-axiom/dialect>
namespace OpenAxiom {
// Categorization of Boot and Spad tokens.
enum class TokenCategory : uint8_t {
Unclassified, // token of unknown class
Whitespace, // sequence of white-space characters
Comment, // a description of an ignorable comment
Punctuator, // a punctuator character
Operator, // an operator both symbolic and alphabetic
Integer, // an integer literal
FloatingPoint, // a floating-point literal
String, // a string literal
Keyword, // a reserved word both symbolic and alphabetic
Identifier, // an identifier
Formatting, // a layout formatting token
Junk, // invalid/malformed token
EOS // end-of-token-stream indicator
};
std::ostream& operator<<(std::ostream&, TokenCategory);
// The abstract value associated with a token.
enum class TokenValue : uint8_t {
#undef OPENAXIOM_DEFINE_TOKEN
#define OPENAXIOM_DEFINE_TOKEN(T, ...) T,
#include <open-axiom/token-value>
#undef OPENAXIOM_DEFINE_TOKEN
Artificial, // Tokens after this are artificial
Indent, // new line indentation, greater than previous
Unindent, // new line indentation, less than previous
Justify, // align indentation with preceding line.
EndOfStream // end of token stream
};
std::ostream& operator<<(std::ostream&, TokenValue);
// Datatypes for locating lines and columns.
using LineNumber = std::size_t;
using ColumnIndex = std::size_t;
struct Locus {
LineNumber line;
ColumnIndex column;
};
std::ostream& operator<<(std::ostream&, const Locus&);
// Program text region
struct Region {
Locus start;
Locus end;
};
// Given a symbolic or alphabetic token, retrieve its category
// and associated abstract value.
struct TokenClassification {
TokenCategory category;
TokenValue value;
explicit operator bool() const {
return category != TokenCategory::EOS;
}
};
TokenClassification classify(const std::string&);
// Token data structure: a region of text with a classification.
struct Token : TokenClassification, Region {
using Location = Locus;
};
// Cursor into a fragment.
struct FragmentCursor {
std::size_t line; // index of a line in a fragment
std::size_t column; // column number at line.
inline FragmentCursor& operator++() {
++column;
return *this;
}
inline FragmentCursor operator++(int) {
auto tmp = *this;
++*this;
return tmp;
}
inline FragmentCursor& operator--() {
--column;
return *this;
}
inline FragmentCursor operator--(int) {
auto tmp = *this;
--*this;
return tmp;
}
};
// -- Exception types
struct EndOfStringUnseen {
LineNumber line;
ColumnIndex column;
};
struct MissingExponent {
LineNumber line;
ColumnIndex column;
};
// Object of this datatype decompose a program fragment into a
// token stream. The tokens are of type indicated by Tok.
template<typename Frag, typename Tok>
struct Tokenizer {
Tokenizer(Frag& f)
: frag(f),
pos{ 0, frag.front().indent }
{
indents.push(pos.column);
}
bool eos() const {
return pos.line >= frag.size()
or (pos.line + 1 == frag.size() and pos.column >= frag.back().size());
}
Tok get(Language = Language::Spad);
private:
Frag& frag;
FragmentCursor pos;
std::stack<ColumnIndex> indents;
std::size_t line_length() const { return frag(pos).size(); }
LineNumber next_line_number() const {
return pos.line + 1 < frag.size()
? frag[pos.line + 1].number
: frag.back().number + 1;
}
ColumnIndex next_indentation() const {
return pos.line + 1 < frag.size() ? frag[pos.line + 1].indent : 0;
}
LineNumber line_number() const {
return pos.line < frag.size()
? frag(pos).number
: frag.back().number + 1;
}
ColumnIndex column_number() const {
return pos.line < frag.size() ? pos.column : 0;
}
using Locus = typename Tok::Location;
Locus current_locus() {
return { line_number(), column_number() };
}
};
bool separator_or_punctuator(uint8_t);
template<typename T>
inline void comment_token(T& t, TokenValue v) {
t.category = TokenCategory::Comment;
t.value = v;
}
template<typename T>
inline void operator_token(T& t, TokenValue v) {
t.category = TokenCategory::Operator;
t.value = v;
}
template<typename T>
inline void punctuator_token(T& t, TokenValue v) {
t.category = TokenCategory::Punctuator;
t.value = v;
}
template<typename L, typename T>
static void junk(L& line, ColumnIndex& idx, T& t) {
while (idx < line.size() and not separator_or_punctuator(line[idx]))
++idx;
t.category = TokenCategory::Junk;
}
template<typename L>
inline void
skip_whitespace(L& line, ColumnIndex& idx) {
while (idx < line.size() and isblank(line[idx]))
++idx;
}
template<typename Frag, typename Tok>
void string_literal(Frag& frag, FragmentCursor& pos, Tok& t) {
bool done = false;
bool escape = false;
while (frag.covering(pos) && not done) {
switch (frag(pos)[pos.column++]) {
case '"': done = !escape;
// fallthrough
default: escape = false; break;
case '_':
if (pos.column == frag(pos).size()
and pos.line < frag.size() - 1) {
++pos.line;
pos.column = 0;
}
else
escape = !escape;
break;
}
}
if (not done)
throw EndOfStringUnseen{ frag(pos).number, pos.column };
t.category = TokenCategory::String;
}
template<typename L>
void skip_to_end_of_integer(L& line, ColumnIndex& idx) {
while (idx < line.size() and isdigit(line[idx]))
++idx;
}
template<typename L, typename T>
void integer(L& line, ColumnIndex& idx, T& t) {
skip_to_end_of_integer(line, idx);
t.category = TokenCategory::Integer;
}
template<typename L, typename T>
T& number(L& line, ColumnIndex& idx, T& t) {
integer(line, idx, t);
if (idx >= line.size() or line[idx] != '.')
return t;
if (++idx >= line.size() or not isdigit(line[idx])) {
--idx;
return t;
}
t.category = TokenCategory::FloatingPoint;
skip_to_end_of_integer(line, idx);
if (idx >= line.size() or (line[idx] != 'e' and line[idx] != 'E'))
return t;
if (++idx < line.size() and (line[idx] == '+' or line[idx] == '-'))
++idx;
if (idx >= line.size() or not isdigit(line[idx]))
throw MissingExponent{ line.number, idx };
skip_to_end_of_integer(line, idx);
return t;
}
inline bool
identifier_head(uint8_t c) {
return isalpha(c) or c == '%' or c == '_';
}
inline bool
identifier_part(uint8_t c) {
return identifier_head(c) or isdigit(c);
}
inline bool
identifier_suffix(uint8_t c) {
return c == '!' or c == '?';
}
inline bool internal_prefix(uint8_t c) {
return c == '%' or c == '$';
}
template<typename L>
inline void
skip_prefix(L& line, ColumnIndex& idx, uint8_t c) {
while (idx < line.size() and line[idx] == c)
++idx;
}
template<typename L, typename T>
T& identifier(L& line, ColumnIndex& idx, T& t, Language dialect) {
t.category = TokenCategory::Identifier;
ColumnIndex start = --idx; // idx was ahead by 1.
if (dialect == Language::Boot and internal_prefix(line[idx]))
skip_prefix(line, idx, line[idx]);
bool saw_escape = false;
while (idx < line.size()) {
if (not identifier_part(line[idx]) and line[idx - 1] != '_')
break;
else if (line[idx] == '_')
saw_escape = true;
++idx;
}
while (idx < line.size() and identifier_suffix(line[idx]))
++idx;
if (!saw_escape) {
auto info = classify(line.sub_string(start, idx));
if (info.category != TokenCategory::Identifier) {
t.category = info.category;
t.value = info.value;
}
}
return t;
}
template<typename Frag, typename Tok>
static void
left_paren_et_al(Frag& frag, FragmentCursor& pos, Tok& t) {
punctuator_token(t, TokenValue::OpenParen);
if (frag.covering(pos) and frag[pos] == '|') {
++pos;
t.value = TokenValue::OpenMetaParen;
}
}
template<typename Frag, typename Tok>
static void
left_brace_et_al(Frag& frag, FragmentCursor& pos, Tok& t) {
punctuator_token(t, TokenValue::OpenBrace);
if (frag.covering(pos) and frag[pos] == '|') {
++pos;
t.value = TokenValue::OpenMetaBrace;
}
}
template<typename Frag, typename Tok>
static void
left_bracket_et_al(Frag& frag, FragmentCursor& pos, Tok& t) {
punctuator_token(t, TokenValue::OpenBracket);
if (frag.covering(pos) and frag[pos] == '|') {
++pos;
t.value = TokenValue::OpenMetaBracket;
}
}
template<typename Frag, typename Tok>
static void
colon_et_al(Frag& frag, FragmentCursor& pos, Tok& t) {
operator_token(t, TokenValue::Colon);
if (frag.covering(pos))
switch (frag[pos]) {
case ':': t.value = TokenValue::ColonColon; ++pos; break;
case '=': t.value = TokenValue::ColonEq; ++pos; break;
case '-': t.value = TokenValue::ColonDash; ++pos; break;
default: break;
}
}
template<typename Frag, typename Tok>
static void
star_et_al(Frag& frag, FragmentCursor& pos, Tok& t) {
operator_token(t, TokenValue::Star);
if (frag.covering(pos) and frag[pos] == '*') {
t.value = TokenValue::StarStar;
++pos;
}
}
template<typename Frag, typename Tok>
static void
slash_et_al(Frag& frag, FragmentCursor& pos, Tok& t) {
operator_token(t, TokenValue::Slash);
if (frag.covering(pos))
switch (frag[pos]) {
case '/': t.value = TokenValue::SlashSlash; ++pos; break;
case '\\': t.value = TokenValue::SlashBackslash; ++pos; break;
default: break;
}
}
template<typename Frag, typename Tok>
static void
backslash_et_al(Frag& frag, FragmentCursor& pos, Tok& t) {
operator_token(t, TokenValue::Backslash);
if (frag.covering(pos))
switch (frag[pos]) {
case '\\': t.value = TokenValue::BackslashBackslash; ++pos; break;
case '/': t.value = TokenValue::BackslashSlash; ++pos; break;
default: break;
}
}
template<typename Frag, typename Tok>
static void
less_et_al(Frag& frag, FragmentCursor& pos, Tok& t) {
operator_token(t, TokenValue::Less);
if (frag.covering(pos))
switch (frag[pos]) {
case '-': t.value = TokenValue::LeftArrow; ++pos; break;
case '<': t.value = TokenValue::OpenChevron; ++pos; break;
case '=':
t.value = TokenValue::LessEq;
if (frag.covering(++pos) and frag[pos] == '>') {
t.value = TokenValue::Equiv;
++pos;
}
break;
default: break;
}
}
template<typename Frag, typename Tok>
static void
equal_et_al(Frag& frag, FragmentCursor& pos, Tok& t) {
operator_token(t, TokenValue::Eq);
if (frag.covering(pos))
switch (frag[pos]) {
case '>': t.value = TokenValue::Implies; ++pos; break;
case '=':
t.value = TokenValue::EqEq;
if (frag.covering(++pos) and frag[pos] == '>') {
t.value = TokenValue::FatArrow;
++pos;
}
break;
default: break;
}
}
template<typename Frag, typename Tok>
static void
tilde_et_al(Frag& frag, FragmentCursor& pos, Tok& t) {
operator_token(t, TokenValue::Tilde);
if (frag.covering(pos) and frag[pos] == '=') {
t.value = TokenValue::TildeEq;
++pos;
}
}
template<typename Frag, typename Tok>
static void
greater_et_al(Frag& frag, FragmentCursor& pos, Tok& t) {
operator_token(t, TokenValue::Greater);
if (frag.covering(pos))
switch (frag[pos]) {
case '=': t.value = TokenValue::GreaterEq; ++pos; break;
case '>': t.value = TokenValue::CloseChevron; ++pos; break;
}
}
template<typename Frag, typename Tok>
static void
bar_et_al(Frag& frag, FragmentCursor& pos, Tok& t) {
punctuator_token(t, TokenValue::Bar);
if (frag.covering(pos))
switch (frag[pos]) {
case ']': t.value = TokenValue::CloseMetaBracket; ++pos; break;
case '}': t.value = TokenValue::CloseMetaBrace; ++pos; break;
case ')': t.value = TokenValue::CloseMetaParen; ++pos; break;
default: break;
}
}
template<typename Frag, typename Tok>
static void
minus_et_al(Frag& frag, FragmentCursor& pos, Tok& t) {
operator_token(t, TokenValue::Minus);
if (frag.covering(pos))
switch (frag[pos]) {
case '>': t.value = TokenValue::RightArrow; ++pos; break;
case '-':
comment_token(t, TokenValue::Wisecrack);
pos.column = frag(pos).size();
break;
}
}
template<typename Frag, typename Tok>
static void
plus_et_al(Frag& frag, FragmentCursor& pos, Tok& t) {
operator_token(t, TokenValue::Plus);
if (frag.covering(pos))
switch (frag[pos]) {
case '+':
comment_token(t, TokenValue::Commentary);
pos.column = frag(pos).size();
break;
case '-':
if (pos.column + 1 < frag(pos).size()
and frag(pos)[pos.column + 1] == '>') {
t.value = TokenValue::MapsTo;
pos.column += 2;
}
break;
default: break;
}
}
template<typename Frag, typename Tok>
static void
dot_et_al(Frag& frag, FragmentCursor& pos, Tok& t) {
operator_token(t, TokenValue::Dot);
if (frag.covering(pos) and frag[pos] == '.') {
t.value = TokenValue::DotDot;
++pos;
}
}
template<typename Frag, typename Tok>
static void
dollar_et_al(Frag& frag, FragmentCursor& pos, Tok& t, Language dialect) {
if (dialect != Language::Boot or not frag.covering(pos)
or separator_or_punctuator(frag[pos]))
operator_token(t, TokenValue::Dollar);
else
identifier(frag(pos), pos.column, t, dialect);
}
template<typename Frag, typename Tok>
static void
sharp_et_al(Frag& frag, FragmentCursor& pos, Tok& t, Language dialect) {
if (dialect != Language::Lisp)
operator_token(t, TokenValue::Sharp);
else if (frag.covering(pos))
switch (frag[pos++]) {
case '(': punctuator_token(t, TokenValue::SharpLeftParen); break;
case '\'': operator_token(t, TokenValue::SharpApostrophe); break;
case ':': operator_token(t, TokenValue::SharpColon); break;
case '+': punctuator_token(t, TokenValue::SharpPlus); break;
case '-': punctuator_token(t, TokenValue::SharpMinus); break;
case '.': operator_token(t, TokenValue::SharpDot); break;
default: --pos; break;
}
}
template<typename Frag, typename Tok>
Tok Tokenizer<Frag, Tok>::get(Language dialect) {
Tok t { };
t.start = current_locus();
if (eos()) {
t.category = TokenCategory::EOS;
t.end = current_locus();
return t;
}
else if (isblank(frag[pos])) {
skip_whitespace(frag(pos), pos.column);
t.category = TokenCategory::Whitespace;
t.end = current_locus();
return t;
}
else if (pos.column == line_length() - 1 and frag(pos).back() == '_') {
++pos.line;
pos.column = frag(pos).indent;
}
else if (pos.column == line_length()) {
auto indent = indents.top();
auto next_indent = next_indentation();
t.start = t.end = { next_line_number(), next_indent };
if (indent < next_indent) {
indents.push(next_indent);
++pos.line;
pos.column = next_indent;
t.category = TokenCategory::Formatting;
t.value = TokenValue::Indent;
}
else if (indent > next_indent) {
indents.pop();
t.category = TokenCategory::Formatting;
t.value = TokenValue::Unindent;
}
else {
++pos.line;
pos.column = next_indent;
t.category = TokenCategory::Formatting;
t.value = TokenValue::Justify;
}
return t;
}
switch (auto c = frag.advance(pos)) {
case '#': sharp_et_al(frag, pos, t, dialect); break;
case '@': operator_token(t, TokenValue::At); break;
case '^': operator_token(t, TokenValue::Caret); break;
case '&': punctuator_token(t, TokenValue::Ampersand); break;
case '!': punctuator_token(t, TokenValue::Exclamation); break;
case '\'': punctuator_token(t, TokenValue::Apostrophe); break;
case ',': punctuator_token(t, TokenValue::Comma); break;
case ';': punctuator_token(t, TokenValue::Semicolon); break;
case '`': punctuator_token(t, TokenValue::Backquote); break;
case '(': left_paren_et_al(frag, pos, t); break;
case ')': punctuator_token(t, TokenValue::CloseParen); break;
case '{': left_brace_et_al(frag, pos, t); break;
case '}': punctuator_token(t, TokenValue::CloseBrace); break;
case '[': left_bracket_et_al(frag, pos, t); break;
case ']': punctuator_token(t, TokenValue::CloseBracket); break;
case ':': colon_et_al(frag, pos, t); break;
case '*': star_et_al(frag, pos, t); break;
case '/': slash_et_al(frag, pos, t); break;
case '\\': backslash_et_al(frag, pos, t); break;
case '<': less_et_al(frag, pos, t); break;
case '=': equal_et_al(frag, pos, t); break;
case '~': tilde_et_al(frag, pos, t); break;
case '>': greater_et_al(frag, pos, t); break;
case '|': bar_et_al(frag, pos, t); break;
case '-': minus_et_al(frag, pos, t); break;
case '+': plus_et_al(frag, pos, t); break;
case '.': dot_et_al(frag, pos, t); break;
case '"': string_literal(frag, pos, t); break;
case '$': dollar_et_al(frag, pos, t, dialect); break;
default:
if (isdigit(c))
number(frag(pos), pos.column, t);
else if (identifier_head(c))
identifier(frag(pos), pos.column, t, dialect);
else
junk(frag(pos), pos.column, t);
break;
}
t.end = { frag(pos).number, pos.column };
return t;
}
// -- Token streams.
template<typename T>
struct TokenStream : std::vector<T> {
template<typename Frag>
explicit TokenStream(Frag& f, Language dialect = Language::Spad) {
Tokenizer<Frag, T> lex { f };
while (auto t = lex.get(dialect))
this->push_back(t);
}
};
}
#endif // OPENAXIOM_TOKEN_included