Change-Id: I461abfda8b7297c1bd04f1090f8d58fa96364616 Reviewed-on: https://galileo.mailstation.de/gerrit/1892 Reviewed-by: Saleem Abdulrasool <compnerd@compnerd.org> Reviewed-by: Kylie McClain <somasis@exherbo.org>
399 lines
12 KiB
C++
399 lines
12 KiB
C++
/* vim: set sw=4 sts=4 et foldmethod=syntax : */
|
|
|
|
/*
|
|
* Copyright (c) 2006, 2007, 2010 Ciaran McCreesh
|
|
*
|
|
* This file is part of the Paludis package manager. Paludis is free software;
|
|
* you can redistribute it and/or modify it under the terms of the GNU General
|
|
* Public License version 2, as published by the Free Software Foundation.
|
|
*
|
|
* Paludis is distributed in the hope that it will be useful, but WITHOUT ANY
|
|
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
|
|
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
|
|
* details.
|
|
*
|
|
* You should have received a copy of the GNU General Public License along with
|
|
* this program; if not, write to the Free Software Foundation, Inc., 59 Temple
|
|
* Place, Suite 330, Boston, MA 02111-1307 USA
|
|
*/
|
|
|
|
#ifndef PALUDIS_GUARD_PALUDIS_TOKENISER_HH
|
|
#define PALUDIS_GUARD_PALUDIS_TOKENISER_HH 1
|
|
|
|
#include <iterator>
|
|
#include <paludis/util/exception.hh>
|
|
#include <paludis/util/stringify.hh>
|
|
#include <string>
|
|
|
|
/** \file
|
|
* Declarations for Tokeniser and related utilities.
|
|
*
|
|
* \ingroup g_strings
|
|
*
|
|
* \section Examples
|
|
*
|
|
* - None at this time.
|
|
*/
|
|
|
|
namespace paludis
|
|
{
|
|
/**
|
|
* Delimiter policy for Tokeniser.
|
|
*
|
|
* \ingroup g_strings
|
|
*/
|
|
namespace delim_kind
|
|
{
|
|
/**
|
|
* Any of the characters split, and the delimiter is discarded.
|
|
*
|
|
* \ingroup g_strings
|
|
*/
|
|
struct AnyOfTag;
|
|
}
|
|
|
|
/**
|
|
* Delimiter mode for Tokeniser.
|
|
*
|
|
* \ingroup g_strings
|
|
*/
|
|
namespace delim_mode
|
|
{
|
|
/**
|
|
* Discard the delimiters.
|
|
*
|
|
* \ingroup g_strings
|
|
*/
|
|
struct DelimiterTag;
|
|
|
|
/**
|
|
* Keep the delimiters.
|
|
*
|
|
* \ingroup g_strings
|
|
*/
|
|
struct BoundaryTag;
|
|
}
|
|
|
|
/**
|
|
* Tokeniser internal use only.
|
|
*
|
|
* \ingroup g_strings
|
|
*/
|
|
namespace tokeniser_internals
|
|
{
|
|
/**
|
|
* A Writer handles Tokeniser's writes.
|
|
*
|
|
* \ingroup g_strings
|
|
*/
|
|
template <typename DelimMode_, typename Iter_>
|
|
struct Writer;
|
|
|
|
/**
|
|
* A Writer handles Tokeniser's writes (specialisation for
|
|
* delim_mode::DelimiterTag).
|
|
*
|
|
* \ingroup g_strings
|
|
*/
|
|
template <typename Iter_>
|
|
struct Writer<delim_mode::DelimiterTag, Iter_>
|
|
{
|
|
/**
|
|
* Handle a token.
|
|
*/
|
|
static void handle_token(const std::string & s, Iter_ & i)
|
|
{
|
|
*i++ = s;
|
|
}
|
|
|
|
/**
|
|
* Handle a delimiter.
|
|
*/
|
|
static void handle_delim(const std::string &, const Iter_ &)
|
|
{
|
|
}
|
|
};
|
|
|
|
/**
|
|
* A Writer handles Tokeniser's writes (specialisation for
|
|
* delim_mode::BoundaryTag).
|
|
*
|
|
* \ingroup g_strings
|
|
*/
|
|
template <typename Iter_>
|
|
struct Writer<delim_mode::BoundaryTag, Iter_>
|
|
{
|
|
/**
|
|
* Handle a token.
|
|
*/
|
|
static void handle_token(const std::string & s, Iter_ & i)
|
|
{
|
|
*i++ = s;
|
|
}
|
|
|
|
/**
|
|
* Handle a delimiter.
|
|
*/
|
|
static void handle_delim(const std::string & s, Iter_ & i)
|
|
{
|
|
*i++ = s;
|
|
}
|
|
};
|
|
|
|
struct Lexer
|
|
{
|
|
const std::string text;
|
|
std::string::size_type text_pos;
|
|
std::string delims;
|
|
const std::string quotes;
|
|
|
|
std::string value;
|
|
enum { t_quote, t_delim, t_text } kind;
|
|
|
|
Lexer(const std::string & t, const std::string & d, const std::string & q) :
|
|
text(t),
|
|
text_pos(0),
|
|
delims(d),
|
|
quotes(q)
|
|
{
|
|
}
|
|
|
|
bool next()
|
|
{
|
|
if (text_pos >= text.length())
|
|
return false;
|
|
|
|
if (std::string::npos != delims.find(text[text_pos]))
|
|
{
|
|
std::string::size_type start_pos(text_pos);
|
|
while (++text_pos < text.length())
|
|
if (std::string::npos == delims.find(text[text_pos]))
|
|
break;
|
|
|
|
value = text.substr(start_pos, text_pos - start_pos);
|
|
kind = t_delim;
|
|
}
|
|
else if (std::string::npos != quotes.find(text[text_pos]))
|
|
{
|
|
value = std::string(1, text[text_pos]);
|
|
kind = t_quote;
|
|
++text_pos;
|
|
}
|
|
else
|
|
{
|
|
std::string::size_type start_pos(text_pos);
|
|
while (++text_pos < text.length())
|
|
if (std::string::npos != delims.find(text[text_pos]))
|
|
break;
|
|
else if (std::string::npos != quotes.find(text[text_pos]))
|
|
break;
|
|
value = text.substr(start_pos, text_pos - start_pos);
|
|
kind = t_text;
|
|
}
|
|
|
|
return true;
|
|
}
|
|
};
|
|
|
|
template <typename DelimKind_, typename DelimMode_ = delim_mode::DelimiterTag>
|
|
class Tokeniser;
|
|
|
|
template <typename DelimMode_>
|
|
class Tokeniser<delim_kind::AnyOfTag, DelimMode_>
|
|
{
|
|
private:
|
|
Tokeniser();
|
|
|
|
public:
|
|
template <typename Iter_>
|
|
static void tokenise(const std::string & s,
|
|
const std::string & delims,
|
|
const std::string & quotes,
|
|
Iter_ iter);
|
|
};
|
|
}
|
|
|
|
/**
|
|
* Thrown if a Tokeniser encounters a syntax error (for example, mismatched quotes).
|
|
*
|
|
* \ingroup g_strings
|
|
* \since 0.26
|
|
*/
|
|
class PALUDIS_VISIBLE TokeniserError :
|
|
public Exception
|
|
{
|
|
public:
|
|
///\name Basic operations
|
|
///\{
|
|
|
|
TokeniserError(const std::string & s, const std::string & msg) noexcept;
|
|
|
|
///\}
|
|
};
|
|
|
|
template <typename DelimMode_>
|
|
template <typename Iter_>
|
|
void
|
|
tokeniser_internals::Tokeniser<delim_kind::AnyOfTag, DelimMode_>::tokenise(
|
|
const std::string & s,
|
|
const std::string & delims,
|
|
const std::string & quotes,
|
|
Iter_ iter)
|
|
{
|
|
typedef tokeniser_internals::Lexer Lexer;
|
|
Lexer l(s, delims, quotes);
|
|
|
|
enum { s_initial, s_had_quote, s_had_text, s_had_quote_text, s_had_quote_text_quote } state = s_initial;
|
|
|
|
while (l.next())
|
|
{
|
|
switch (state)
|
|
{
|
|
case s_initial:
|
|
switch (l.kind)
|
|
{
|
|
case Lexer::t_quote:
|
|
state = s_had_quote;
|
|
l.delims = "";
|
|
break;
|
|
|
|
case Lexer::t_delim:
|
|
state = s_initial;
|
|
tokeniser_internals::Writer<DelimMode_, Iter_>::handle_delim(l.value, iter);
|
|
break;
|
|
|
|
case Lexer::t_text:
|
|
state = s_had_text;
|
|
tokeniser_internals::Writer<DelimMode_, Iter_>::handle_token(l.value, iter);
|
|
break;
|
|
}
|
|
break;
|
|
|
|
case s_had_quote:
|
|
switch (l.kind)
|
|
{
|
|
case Lexer::t_quote:
|
|
state = s_had_quote_text_quote;
|
|
l.delims = delims;
|
|
tokeniser_internals::Writer<DelimMode_, Iter_>::handle_token("", iter);
|
|
break;
|
|
|
|
case Lexer::t_delim:
|
|
throw InternalError(PALUDIS_HERE, "t_delim in s_had_quote");
|
|
break;
|
|
|
|
case Lexer::t_text:
|
|
state = s_had_quote_text;
|
|
tokeniser_internals::Writer<DelimMode_, Iter_>::handle_token(l.value, iter);
|
|
break;
|
|
}
|
|
break;
|
|
|
|
case s_had_quote_text:
|
|
switch (l.kind)
|
|
{
|
|
case Lexer::t_text:
|
|
throw InternalError(PALUDIS_HERE, "t_text in s_had_quote_text");
|
|
break;
|
|
|
|
case Lexer::t_delim:
|
|
throw InternalError(PALUDIS_HERE, "t_delim in s_had_quote_text");
|
|
break;
|
|
|
|
case Lexer::t_quote:
|
|
state = s_had_quote_text_quote;
|
|
l.delims = delims;
|
|
break;
|
|
}
|
|
break;
|
|
|
|
case s_had_quote_text_quote:
|
|
switch (l.kind)
|
|
{
|
|
case Lexer::t_text:
|
|
throw TokeniserError(s, "Close quote followed by text");
|
|
break;
|
|
|
|
case Lexer::t_quote:
|
|
throw TokeniserError(s, "Close quote followed by quote");
|
|
break;
|
|
|
|
case Lexer::t_delim:
|
|
state = s_initial;
|
|
tokeniser_internals::Writer<DelimMode_, Iter_>::handle_delim(l.value, iter);
|
|
break;
|
|
}
|
|
break;
|
|
|
|
case s_had_text:
|
|
switch (l.kind)
|
|
{
|
|
case Lexer::t_text:
|
|
throw InternalError(PALUDIS_HERE, "t_text in s_had_text");
|
|
break;
|
|
|
|
case Lexer::t_quote:
|
|
throw TokeniserError(s, "Text followed by quote");
|
|
break;
|
|
|
|
case Lexer::t_delim:
|
|
state = s_initial;
|
|
tokeniser_internals::Writer<DelimMode_, Iter_>::handle_delim(l.value, iter);
|
|
break;
|
|
}
|
|
break;
|
|
}
|
|
}
|
|
|
|
switch (state)
|
|
{
|
|
case s_initial:
|
|
case s_had_text:
|
|
case s_had_quote_text_quote:
|
|
return;
|
|
|
|
case s_had_quote:
|
|
case s_had_quote_text:
|
|
throw TokeniserError(s, "Unterminated quoted string");
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Tokenise a string.
|
|
*
|
|
* \ingroup g_strings
|
|
* \since 0.26
|
|
*/
|
|
template <typename DelimKind_, typename DelimMode_, typename Iter_>
|
|
void tokenise(const std::string & s, const std::string & delims, const std::string & quotes, Iter_ iter)
|
|
{
|
|
tokeniser_internals::Tokeniser<DelimKind_, DelimMode_>::template tokenise<Iter_>(s, delims, quotes, iter);
|
|
}
|
|
|
|
/**
|
|
* Convenience function: tokenise on whitespace.
|
|
*
|
|
* \ingroup g_strings
|
|
* \since 0.26
|
|
*/
|
|
template <typename Iter_>
|
|
void tokenise_whitespace(const std::string & s, Iter_ iter)
|
|
{
|
|
tokenise<delim_kind::AnyOfTag, delim_mode::DelimiterTag>(s, " \t\r\n", "", iter);
|
|
}
|
|
|
|
/**
|
|
* Convenience function: tokenise on whitespace, handling quoted strings.
|
|
*
|
|
* \ingroup g_strings
|
|
* \since 0.26
|
|
*/
|
|
template <typename Iter_>
|
|
void tokenise_whitespace_quoted(const std::string &s, Iter_ iter)
|
|
{
|
|
tokenise<delim_kind::AnyOfTag, delim_mode::DelimiterTag>(s, " \t\r\n", "'\"", iter);
|
|
}
|
|
}
|
|
|
|
#endif
|