# This library is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# This library is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this library; if not, see <http://www.gnu.org/licenses/>.
"""
Koji Smoky Dingo - Sifty Dingo Parser
This is the parser for the Sifty Dingo filtering language.
:author: Christopher O'Brien <obriencj@gmail.com>
:license: GPL v3
"""
import re
from abc import ABCMeta
from codecs import decode
from fnmatch import translate
from functools import partial
from io import StringIO
from itertools import chain, product
from typing import Iterable, Iterator, List, Sequence, Sized, Union, cast
from .. import BadDingo
__all__ = (
"AllItems",
"Glob",
"Item",
"ItemMatch",
"ItemPath",
"Matcher",
"Null",
"Number",
"Reader",
"Regex",
"RegexError",
"Symbol",
"SymbolGroup",
"convert_escapes",
"convert_token",
"parse_exprs",
)
class ParserError(BadDingo):
    """
    Raised when sifter source text cannot be parsed.
    """

    complaint = "Error parsing Sifter"
class RegexError(ParserError):
    """
    Raised when a regex matcher fails to compile.
    """

    complaint = "Error compiling Regex"
class Matcher(metaclass=ABCMeta):
    """
    Base class for special comparison types
    """

    # no abstract methods; subclasses implement their own __eq__
    pass
class Null(Matcher):
    """
    An empty literal, represented by the symbols ``null`` or
    ``None``. Matches only with the python None value.
    """

    def __eq__(self, val):
        # compares equal only to the actual None singleton
        return val is None

    def __str__(self):
        return "null"

    def __repr__(self):
        return "Null()"
class Symbol(str, Matcher):
    """
    An unquoted literal series of characters. A symbol can compare
    with python str instances.
    """

    def __repr__(self):
        return f"Symbol({str(self)!r})"
class SymbolGroup(Matcher):
    """
    A symbol group is a literal symbol with multiple permutations. It is
    represented as a symbol containing groups within curly-braces

    Examples:

    * ``{foo,bar}-candidate`` is equal to foo-candidate and bar-candidate
    * ``foo-{1..3}`` is equal to any of foo-1, foo-2, foo-3
    """

    def __init__(self, src, groups):
        # src: the original group-notation text, kept for repr
        # groups: a sequence of choice sequences, one per segment
        self.src = src
        self.groups = groups

    def __iter__(self):
        # produce every permutation by joining one choice from each
        # group, yielding Numbers for all-digit results
        for k in map("".join, product(*self.groups)):
            if NUMBER_RE == k:
                yield Number(k)
            else:
                yield Symbol(k)

    def __eq__(self, val):
        # equal if any permutation matches
        return any(map(lambda s: s == val, self))

    def __repr__(self):
        return f"SymbolGroup({self.src!r})"
class FormattedSeries(Sequence[str]):
    """
    A portion of a `SymbolGroup` representing a repeatable formatted
    sequence.
    """

    def __init__(self, fmt: str, seq: Sequence):
        """
        :param fmt: formatting to apply

        :param seq: sequence which can safely have `iter` called on it
          multiple times
        """

        self._fmt = fmt
        self._seq = seq

    def __getitem__(self, index):
        item = self._seq[index]
        return self._fmt.format(item)

    def __iter__(self):
        formatter = self._fmt.format
        return map(formatter, self._seq)

    def __len__(self):
        return len(self._seq)
class Number(int, Matcher):
    """
    A number is a literal made entirely of digits. It can compare with
    both the python int and str types.
    """

    # NOTE: defining __eq__ without __hash__ leaves Number unhashable;
    # it cannot be used as a dict key or set member.

    def __eq__(self, val):
        if isinstance(val, str):
            # NUMBER_RE is a module-level constant defined after this
            # class; it is only looked up at call time
            if NUMBER_RE == val:
                val = int(val)
        return int(self) == val

    def __repr__(self):
        return f"Number({int(self)})"
class Regex(Matcher):
    """
    A regex is a quoted literal using forward-slashes as quotes

    Examples:

    * ``/.*foo$/`` is a case-sensitive match for text ending in foo
    * ``/.*foo$/i`` is a case-insensitive match for text ending in foo

    :raises RegexError: if the pattern fails to compile
    """

    def __init__(self, src, flags=None):
        self._src = src
        self._flagstr = flags

        # translate the single-character flag string (eg. "i") into
        # the combined re module flag value; unknown letters add 0
        fint = sum(getattr(re, c.upper(), 0) for c in flags) if flags else 0

        try:
            self._re = re.compile(src, fint)
        except re.error as exc:
            # chain the original error so the pattern detail isn't lost
            raise RegexError(str(exc)) from exc

    def __eq__(self, val):
        try:
            return bool(self._re.findall(val))
        except TypeError:
            # non-string values simply never match
            return False

    def __str__(self):
        return self._src

    def __repr__(self):
        if self._flagstr:
            return f"Regex({self._src!r}, flags={self._flagstr!r})"
        else:
            return f"Regex({self._src!r})"
class Glob(Matcher):
    """
    A glob is a quoted literal using pipes as quotes

    Examples:

    * ``|*foo|`` is a case-sensitive match for text ending in foo
    * ``|*foo|i`` is a case-insensitive match for text ending in foo
    """

    def __init__(self, src, ignorecase=False):
        self._src = src
        self._ignorecase = ignorecase

        # fnmatch.translate converts the glob into a regex pattern
        self._re = re.compile(translate(src), re.I if ignorecase else 0)

    def __eq__(self, val):
        try:
            return self._re.match(val) is not None
        except TypeError:
            # non-string values simply never match
            return False

    def __str__(self):
        return self._src

    def __repr__(self):
        if self._ignorecase:
            return f"Glob({self._src!r}, ignorecase=True)"
        else:
            return f"Glob({self._src!r})"
class Item():
    """
    Seeks path members by an int or str key.
    """

    def __init__(self, key: Union[int, str, slice, Matcher]):
        # normalize subclasses (eg. Number, Symbol) down to the plain
        # builtin int/str types
        if isinstance(key, int):
            key = int(key)
        elif isinstance(key, str):
            key = str(key)
        self.key = key

    def get(self, d: dict) -> Iterator:
        """
        Yields the member(s) of d addressed by this item's key, if any.
        A missing key yields nothing rather than raising.
        """

        key = self.key
        try:
            if isinstance(key, slice):
                # a slice addresses multiple members
                for v in d[key]:
                    yield v
            else:
                yield d[key]
        except (IndexError, KeyError):
            # absent members are silently skipped.
            # do not catch TypeError
            pass

    def __repr__(self):
        return f"{type(self).__name__}({self.key!r})"
class ItemMatch(Item):
    """
    Seeks path members by comparison of keys to a matcher (eg. a `Glob`
    or `Regex`)
    """

    def get(self, d: dict) -> Iterator:
        """
        Yields the values of d whose keys compare equal to this
        item's matcher. Non-dict containers are matched by index.
        """

        if isinstance(d, dict):
            data = d.items()
        else:
            # sequences match against their integer indices
            data = enumerate(d)

        key = self.key
        for k, v in data:
            # matcher on the left so its __eq__ drives the comparison
            if key == k:
                yield v
class AllItems(Item):
    """
    Seeks all path members
    """

    def __init__(self):
        # no key: this item matches everything
        pass

    def get(self, d: dict) -> Iterator:
        """
        Yields every member value of d.
        """

        if isinstance(d, dict):
            return iter(d.values())
        else:
            return iter(d)

    def __repr__(self):
        return "AllItems()"
# the kinds of values accepted as elements of an ItemPath
ItemPathSpec = Union[Item, Matcher, str, int, slice]


class ItemPath():
    """
    Represents a collection of elements inside a nested tree of lists
    and dicts

    :raises ParserError: if constructed with an unsupported path element
    """

    def __init__(self, *paths: ItemPathSpec):
        ipaths: List[Item] = []
        self.paths = ipaths

        # normalize each element into an Item. Note the ordering:
        # Symbol and Number are both str/int subclasses AND Matchers,
        # so they are caught by the (str, int, slice) branch and become
        # plain key lookups, not ItemMatch comparisons.
        for p in paths:
            if isinstance(p, Item):
                ipaths.append(p)
            elif isinstance(p, (str, int, slice)):
                ipaths.append(Item(p))
            elif isinstance(p, Matcher):
                ipaths.append(ItemMatch(p))
            else:
                msg = f"Unexpected path element in ItemPath: {p!r}"
                raise ParserError(msg)

    def get(self, data: dict) -> Iterator:
        """
        Lazily yields every element of data addressed by this path.
        """

        work = iter([data])
        for element in self.paths:
            # filter(None, ...) drops falsy intermediates (None, empty
            # containers) before descending another level
            work = chain(*map(element.get, filter(None, work)))
        return work

    def __repr__(self):
        paths = ", ".join(map(str, self.paths))
        return f"ItemPath({paths})"
class Reader(StringIO):
    """
    A character stream over a str, adding a non-consuming `peek` to
    the inherited StringIO read interface.
    """

    def __init__(self, source: str):
        # force it to be readonly
        # NOTE(review): StringIO(source) is not actually readonly;
        # presumably the intent is just that this parser never writes
        super().__init__(source)

    def peek(self, count: int = 1) -> str:
        """
        Read up to count characters without advancing the stream
        position. Returns ``''`` at EOF.
        """

        where = self.tell()
        val = self.read(count)
        self.seek(where)
        return val
def split_symbol_groups(
        source: str) -> Iterator[Sequence[str]]:
    """
    Invoked by `convert_token` to split up a symbol into a series
    of groups which can then be combined to form a SymbolGroup.

    Yields a sequence of choices per segment: literal text becomes a
    single-choice list, while ``{...}`` sections are expanded via
    `convert_group`.

    :param source: symbol text, possibly containing ``{}`` groups
    """

    reader = Reader(source)

    token: Union[StringIO, None] = None
    esc = False

    srciter = iter(partial(reader.read, 1), '')

    for c in srciter:
        if esc:
            # the escaped character is written verbatim; the
            # backslash itself is dropped
            esc = False
            if not token:
                token = StringIO()
            token.write(c)
            continue

        elif c == '\\':
            esc = True
            continue

        elif c == '{':
            # flush any literal text collected so far as a
            # single-choice group
            if token:
                yield [token.getvalue()]
                token = None
            # parse_quoted reads through the matching closing brace
            yield convert_group(cast(str, parse_quoted(reader, '}')))
            continue

        else:
            if not token:
                token = StringIO()
            token.write(c)
            continue

    if token:
        # trailing literal text
        yield [token.getvalue()]
        token = None
def _trailing_esc(
val: str) -> int:
# a count of trailing escapes, just to figure out if there's an
# odd or even amount (and hence whether there's an unterminated
# escape at the end
return len(val) - len(val.rstrip("\\"))
def convert_group(
        grp: str) -> Union[FormattedSeries, List[str]]:
    """
    A helper function for `split_symbol_groups`

    :param grp: group eg. ``"1,2,3"`` or range specifier eg. ``"1..3"``
    """

    if "," not in grp:
        if ".." in grp:
            return convert_range(grp)
        else:
            # neither a choice list nor a range: keep the braces as
            # literal text
            return [f"{{{grp}}}"]

    work: List[str] = []
    for brk in grp.split(","):
        # an odd number of trailing backslashes on the previous piece
        # means the comma we split on was escaped, so re-join the two
        # pieces (dropping the escaping backslash). The whole previous
        # piece is examined — not just its final character — so an
        # escaped backslash (``\\``) does not swallow a real comma.
        if work and work[-1] and _trailing_esc(work[-1]) & 1:
            work[-1] = ",".join((work[-1][:-1], brk))
        else:
            work.append(brk)

    if len(work) == 1:
        # every comma turned out to be escaped: literal text
        return [f"{{{work[0]}}}"]
    else:
        return work
def convert_range(rng: str) -> Union[FormattedSeries, List[str]]:
    """
    A helper function for `convert_group` to work with the group range
    notation.

    range notation can be specified as ``START..STOP`` or as
    ``START..STOP..STEP``. note that any zero-prefix padding on the
    endpoints is honored, and the same padding is applied to the
    generated values

    produces a `FormattedSeries` built on a range instance

    if the range specifier is invalid, then returns a list with the
    specifier as the only value

    :param rng: range specifier, eg ``"1..3"``
    """

    fields = rng.split("..")

    if len(fields) == 2:
        start, stop = fields
        step: Union[int, str] = 1
    elif len(fields) == 3:
        start, stop, step = fields
    else:
        # wrong number of pieces: not a range, keep as literal text
        return [f"{{{rng}}}"]

    try:
        istart = int(start)
        # +1 so that STOP is inclusive
        istop = int(stop) + 1
        istep = int(step)
    except ValueError:
        # non-numeric pieces: not a range, keep as literal text
        return [f"{{{rng}}}"]

    # a zero-prefixed endpoint requests fixed-width zero padding
    # NOTE(review): the inclusive-stop adjustment assumes an ascending
    # range; a negative STEP produces a truncated series — confirm
    if any(len(v) > 1 and v.startswith("0") for v in (start, stop)):
        width = max(len(start), len(stop))
        fmt = f"{{0:0{width}d}}"
    else:
        fmt = "{0:d}"

    return FormattedSeries(fmt, range(istart, istop, istep))
def parse_exprs(
        reader: Reader,
        start: str = None,
        stop: str = None) -> Iterator:
    """
    Simple s-expr parser. Reads from a string or character iterator,
    emits expressions as nested lists.

    :param reader: source to parse from

    :param start: opening delimiter of the current expression, or None
      at the top level

    :param stop: closing delimiter of the current expression, or None
      at the top level

    :raises ParserError: on an unbalanced delimiter
    """

    # I've been re-using this code for over a decade. It was
    # originally in a command-line tool I wrote named 'deli' which
    # worked with del.icio.us for finding and filtering through my
    # bookmarks. Then I used it in Spexy and a form of it is the basis
    # for Sibilant's parser as well. And now it lives here, in Koji
    # Smoky Dingo.

    if not (start and stop):
        # top level: bare tokens are allowed and EOF is a normal end
        unterminated = True
        start = '('
        stop = ')'
    else:
        unterminated = False

    # bandit thinks this is a password, haha
    token_breaks = f"{start}{stop} [;#|/\"\'\n\r\t"  # nosec

    token = None   # accumulator for the symbol currently being read
    esc = None     # a pending backslash escape, if any

    srciter = iter(partial(reader.read, 1), '')

    for c in srciter:
        if esc:
            if not token:
                token = StringIO()
            if c not in token_breaks:
                # the backslash survives unless it was escaping a
                # token-breaking character
                token.write(esc)
            token.write(c)
            esc = None
            continue

        if c == '\\':
            esc = c
            continue

        elif c == '.' and token is None:
            # a leading dot starts an item path with no prefix
            yield parse_itempath(reader, None, c)
            continue

        elif c == '[':
            # an index opener; any token collected so far becomes the
            # path's prefix
            prefix = None
            if token:
                prefix = token.getvalue()
                token = None
            yield parse_itempath(reader, prefix, c)
            continue

        elif c in token_breaks:
            if token:
                yield convert_token(token.getvalue())
                token = None

        else:
            if not token:
                token = StringIO()
            token.write(c)
            continue

        # if we get this far, then c is in token_breaks
        if c in ';#':
            # comments run to end of line
            reader.readline()

        elif c == start:
            # recurse for a nested expression
            yield list(parse_exprs(reader, start, stop))

        elif c == stop:
            if unterminated:
                raise ParserError(f"Unexpected closing {c!r}")
            else:
                return

        elif c in '\'\"/|':
            yield parse_quoted(reader, c)

    if unterminated:
        # leftovers are therefore allowed
        if token:
            yield convert_token(token.getvalue())
    else:
        # we shouldn't have reached this
        raise ParserError(f"Unexpected EOF, missing closing {stop!r}")
# matches the escape sequences that convert_escapes will decode
ESCAPE_SEQUENCE_RE = re.compile(r'''
    (\\U........
    | \\u....
    | \\x..
    | \\[0-7]{1,3}
    | \\N\{[^}]+\}
    | \\[\\'"abfnrtv]
    )''', re.UNICODE | re.VERBOSE)


def convert_escapes(val: str) -> str:
    """
    Decodes common escape sequences embedded in a str

    :param val: source str to decode
    """

    def descape(m):
        # decode just the matched escape sequence, leaving the rest
        # of the string untouched
        return decode(m.group(0), 'unicode-escape')

    return ESCAPE_SEQUENCE_RE.sub(descape, val)
# matches an optionally-negative all-digit literal
NUMBER_RE = Regex(r"^-?\d+$")


def convert_token(val: str) -> Union[Matcher, str, bool]:
    """
    Converts unquoted values to a `Matcher` instance.

    * An all-digit value will become `Number`
    * None, null, nil become a `Null`
    * True becomes the boolean `True`
    * False becomes the boolean `False`
    * Use of ``{}`` may become a `SymbolGroup` or `Symbol`
    * Everything else becomes a `Symbol`

    :param val: token value to be converted
    """

    if val in (None, "None", "null", "nil"):
        return Null()

    elif val is True or val == "True":
        # note, we do not use 'in' because 1 would match as True
        return True

    elif val is False or val == "False":
        # note, we do not use 'in' because 0 would match as False
        return False

    elif NUMBER_RE == val:
        return Number(val)

    else:
        val = convert_escapes(val)
        if "{" in val:
            grps = list(split_symbol_groups(val))
            if all(map(lambda v: len(v) == 1, grps)):
                # in cases where there's only one choice in all the
                # groups, then we can simply create a single Symbol
                # from those merged choices.
                val = "".join(str(g[0]) for g in grps)
                return Symbol(val)
            else:
                return SymbolGroup(val, grps)
        else:
            return Symbol(val)
def parse_itempath(
        reader: Reader,
        prefix: str = None,
        char: str = None) -> ItemPath:
    """
    Parses an `ItemPath` definition from the given reader.

    :param reader: source reader to parse from

    :param prefix: an initial path token to convert as the start of the
      path

    :param char: the initiating character that has already been read from
      the reader, if any
    """

    paths: List[ItemPathSpec] = []

    if prefix:
        paths.append(convert_token(prefix))

    if char == '[':
        # the path began with an index expression rather than a token
        paths.append(parse_index(reader, char))

    # bandit thinks this is a password, haha
    token_breaks = ' .[]();#|/\"\'\n\r\t'  # nosec

    token: StringIO = None
    esc: str = None

    # NOTE: characters are peeked here and only consumed at the bottom
    # of the loop, so a breaking character that ends the path (eg.
    # whitespace or a closing paren) stays in the reader for the caller
    srciter = iter(partial(reader.peek, 1), '')

    for c in srciter:
        if esc:
            if token is None:
                token = StringIO()
            if c not in token_breaks:
                # the backslash survives unless it was escaping a
                # token-breaking character
                token.write(esc)
            token.write(c)
            esc = None

        elif c == '\\':
            esc = c

        elif c in token_breaks:
            if token:
                # a path element is complete; convert and record it
                paths.append(convert_token(token.getvalue()))
                token = None

            if c == "[":
                c = reader.read(1)
                paths.append(parse_index(reader, c))
                # parse_index consumed through the closing bracket;
                # skip the consuming read at the bottom of the loop
                continue
            elif c == "]":
                c = reader.read(1)
                raise ParserError(f"Unexpected closer: {c!r}")
            elif c == ".":
                # dots just separate path elements
                pass
            else:
                # any other breaking character ends the path, and is
                # deliberately left unconsumed
                break

        else:
            if token is None:
                token = StringIO()
            token.write(c)

        # actually consume the character from the reader
        reader.read(1)

    if token:
        # trailing element ending at EOF
        paths.append(convert_token(token.getvalue()))
        token = None

    return ItemPath(*paths)
# recognizes colon-separated slice notation: bare colons, and any of
# start/stop/step as optionally-signed integers
_slice_like = Regex(
    r"^(:|::"
    r"|[+-]?\d*:"
    r"|:[+-]?\d*"
    r"|[+-]?\d*:[+-]?\d*"
    r"|[+-]?\d*:[+-]?\d*:[+-]?\d*)$")
def convert_slice(val: str) -> slice:
    """
    Converted a colon-separated string into a slice. Raises a TypeError
    if the elements do not convert cleanly to integers

    Examples:

    * val of ``1:`` results in ``slice(1, None, None)``
    * val of ``:1`` results in ``slice(None, 1, None)``
    * val of ``"1:2:3"`` results in ``slice(1, 2, 3)``
    """

    pieces = []
    for part in val.split(":"):
        # an empty segment means "unspecified" for that position
        pieces.append(int(part) if part else None)
    return slice(*pieces)
def parse_index(
        reader: Reader,
        start: str = None) -> ItemPathSpec:
    """
    Parse an index portion of an `ItemPath` from the reader

    :param reader: source to read from

    :param start: the opening ``[`` if it was already consumed, or
      None to read it from the reader

    :raises ParserError: on a missing or malformed index
    """

    opener = start if start else reader.read(1)

    if not opener:
        msg = "Unterminated item index, missing closing ']'"
        raise ParserError(msg)
    if opener != '[':
        msg = f"Unknown item index start: {opener!r}"
        raise ParserError(msg)

    # everything up to the matching close bracket
    found = list(parse_exprs(reader, '[', ']'))

    if not found:
        # an empty index selects every member
        return AllItems()

    if len(found) > 1:
        msg = f"Too many arguments in item index: {found!r}"
        raise ParserError(msg)

    single: str = found[0]
    # colon notation becomes a real slice; anything else passes through
    return convert_slice(single) if _slice_like == single else single
# the possible results of parsing a quoted value
QuotedSpec = Union[Glob, Regex, str]


def parse_quoted(
        reader: Reader,
        quotec: str = None,
        advanced_escapes: bool = True) -> QuotedSpec:
    """
    Helper function for `parse_exprs`, will parse quoted values and
    return the appropriate wrapper type depending on the quoting
    character.

    * ``"foo"`` is a `str`
    * ``/foo/`` is a `Regex`
    * ``|foo|`` is a `Glob`

    Symbols are generated in the parse_exprs function directly, as
    they are not quoted.

    It is expected that the first quoting character will have been
    read already from src prior to this function being invoked. If
    that is not the case, and the first quoting character is still in
    the src iterable, then a quotec of None can be used to indicate
    that it should be taken from the first character of the src.

    :param reader: source to read from

    :param quotec: initiating quote character, or None if the first
      character should be read from the reader

    :param advanced_escapes: if True then the escaped character will
      be parsed for character escape sequences which will be replaced
      with their relevant unicode value. if False then escaped
      character will simply be inlined into the value

    :raises ParserError: if the quoted value is unterminated
    """

    if not quotec:
        quotec = reader.read(1)
        if not quotec:
            msg = f"Unterminated matcher: missing closing {quotec!r}"
            raise ParserError(msg)

    token = StringIO()
    esc = None

    srciter = iter(partial(reader.read, 1), '')

    for c in srciter:
        if esc:
            if advanced_escapes and c != quotec:
                # keep the backslash so convert_escapes can later see
                # the complete escape sequence
                token.write(esc)
            token.write(c)
            esc = None
        elif c == quotec:
            break
        elif c == '\\':
            esc = c
        else:
            token.write(c)
    else:
        # loop ran to EOF without hitting the closing quote
        msg = f"Unterminated matcher: missing closing {quotec!r}"
        raise ParserError(msg)

    val = token.getvalue()
    if advanced_escapes:
        val = convert_escapes(val)

    if quotec == "/":
        flags = []
        # hard-coding the flags we support for regex.
        # the explicit EOF check matters: peek returns '' at EOF, and
        # ('' in "aiLmsux") is True, which would loop forever
        c = reader.peek(1)
        while c and c in "aiLmsux":
            flags.append(reader.read(1))
            c = reader.peek(1)
        return Regex(val, "".join(flags))

    elif quotec == "|":
        iflag = False
        # hard-coding that we only support a single flag for glob
        if reader.peek(1) == 'i':
            reader.read(1)
            iflag = True
        return Glob(val, ignorecase=iflag)

    else:
        # plain ol' string
        return val
#
# The end.