Source code for ClearMap.Utils.TagExpression

# -*- coding: utf-8 -*-
"""
TagExpression
=============

Module providing routines to check and convert between tag expressions.
This simplifies the handling of lists of files by replacing regular
expressions with tag expressions.

A tag is a simple placeholder for a string or number in a file name.
An expression is a filename with any number of tags.

A tag has the format <Name,Type,Width>. 
  * Name : str that specifies the tag name
  * Type : 'I' or 'S' for integer or string, optional and defaults to 'I'
  * Width : int, if given indicates a fixed width; missing digits or characters
    are filled with leading zeros (integers) or leading spaces (strings).

The module also provides functions to infer the tag expressions from a list of
file names via the function :func:`~ClearMap.Utils.TagExpression.detect`.

Expressions can also be converted to glob or regular expressions.


Example
-------
An example tag expression would be 'file_<X,2>_<Y,3>.npy' and would for example
match the filename 'file_01_013.npy'

>>> import ClearMap.Utils.TagExpression as te
>>> e = te.Expression('file_<X,2>_<Y,3>.npy')
>>> e.tag_names()
['X', 'Y']

>>> e.glob()
'file_[0-9][0-9]_[0-9][0-9][0-9].npy'

>>> e.indices('file_01_013.npy')
[1, 13]

>>> e.re()
'file_(?P<X>\\d{2})_(?P<Y>\\d{3})\\.npy'

>>> e.string({'X':1, 'Y':10})
'file_01_010.npy'

See also
--------
:mod:`~ClearMap.Utils.RegularExpression`
"""
__author__    = 'Christoph Kirst <christoph.kirst.ck@gmail.com>'
__license__   = 'GPLv3 - GNU General Pulic License v3 (see LICENSE.txt)'
__copyright__ = 'Copyright © 2020 by Christoph Kirst'
__webpage__   = 'http://idisco.info'
__download__  = 'http://www.github.com/ChristophKirst/ClearMap2'


import copy

import re

# Delimiters of a tag inside an expression, e.g. '<Name,Type,Width>'.
TAG_START = '<'
TAG_END = '>'

# Separator between the entries of a tag.
TAG_SEPARATOR = ','

# Type codes that may appear inside a tag: integer and string.
TAG_INT = 'I'
TAG_STR = 'S'

def ttype_to_dtype(ttype):
    """Return the Python type that corresponds to a tag type code.

    Arguments
    ---------
    ttype : str or None
        TAG_INT, TAG_STR or None. None is treated like TAG_INT.

    Returns
    -------
    dtype : type
        ``int`` for integer tags, ``str`` for string tags.

    Raises
    ------
    ValueError
        If the type code is none of the above.
    """
    if ttype is None or ttype == TAG_INT:
        return int
    if ttype == TAG_STR:
        return str
    raise ValueError('The specified tag type %r is not valid!' % ttype)
def default_tag_name(index = None):
    """Return the label used for a tag that has no explicit name.

    Arguments
    ---------
    index : int or None
        Position of the tag; appended to the base label when given.

    Returns
    -------
    name : str
        'Tag' if no index is given, otherwise 'Tag' followed by the index.
    """
    return 'Tag' if index is None else 'Tag%d' % index
class Tag(object):
    """A single placeholder of a tag expression.

    In text form a tag is written as ``<Name,Type,Width>`` where every entry
    is optional.

    Attributes
    ----------
    name : str or None
        Name of the tag. Tags without a name are labelled via
        :func:`default_tag_name`.
    ttype : str or None
        TAG_INT or TAG_STR. None is treated as an integer tag.
    width : int or None
        Fixed number of characters of the tag, None for variable width.
    reference : bool or None
        Truthy if the tag refers back to an earlier tag with the same name.
    trange : sequence or None
        Optional list of values; when set, an index i corresponds to the
        value ``trange[i]``.
    """

    def __init__(self, tag = None, name = None, ttype = TAG_INT, width = None, reference = None, trange = None):
        """Create a tag from its text form or from explicit properties.

        If `tag` is given it is parsed and all other arguments are ignored.
        """
        if tag is not None:
            self.parse(tag)
        else:
            self.name = name
            self.ttype = ttype
            self.width = width
            self.reference = reference
            self.trange = trange

    def _is_int(self):
        """Return True for integer tags; a missing type counts as integer."""
        return self.ttype == TAG_INT or self.ttype is None

    def label(self, index = None):
        """Return the tag name or, if unnamed, the default name for `index`."""
        if self.name is not None:
            return self.name
        return default_tag_name(index = index)

    def dtype(self):
        """Return the Python type (int or str) of the values of this tag."""
        return ttype_to_dtype(self.ttype)

    def tag(self):
        """Return the text form of the tag, e.g. '<X,I,4>'.

        Entries that are None are left out.
        """
        entries = []
        if self.name is not None:
            entries.append(self.name)
        if self.ttype is not None:
            entries.append(self.ttype)
        if self.width is not None:
            entries.append(str(self.width))
        return TAG_START + TAG_SEPARATOR.join(entries) + TAG_END

    def glob(self):
        """Return a glob pattern matching the tag.

        Fixed width tags give one '[0-9]' (integer) or '?' (otherwise) per
        character, variable width tags give '*'.
        """
        if self.width is None:
            return '*'
        # Anything that is not an integer tag is matched like a string tag,
        # in line with re() and string().
        single = '[0-9]' if self._is_int() else '?'
        return single * self.width

    def re(self, name = None):
        """Return a regular expression group matching the tag.

        Arguments
        ---------
        name : str or None
            Group name to use, defaults to the tag name. Without any name an
            unnamed group is produced.

        Returns
        -------
        expression : str
            A back-reference '(?P=name)' for referenced tags, otherwise a
            (named) group of digits or arbitrary characters.

        Raises
        ------
        ValueError
            If the tag is a reference and no name is available.
        """
        if name is None:
            name = self.name

        if self.reference:
            if name is None:
                raise ValueError('Name needs to be given for a referenced tag!')
            return '(?P=' + name + ')'

        e = '(?P<' + name + '>' if name is not None else '('
        # Raw string: '\d' in a normal literal is an invalid escape sequence.
        e += r'\d' if self._is_int() else '.'
        if self.width is not None:
            e += '{' + str(self.width) + '}'
        else:
            # Non-greedy so that following literal text can still match.
            e += '*?'
        return e + ')'

    def string(self, value = None):
        """Return the string for a value, or the tag itself if value is None.

        Integers of fixed width are padded with leading zeros, strings of
        fixed width with leading spaces.
        """
        if value is None:
            return self.tag()

        code = 'd' if self._is_int() else 's'
        if self.width is None:
            frmt = '%' + code
        elif self._is_int():
            frmt = '%0' + str(self.width) + code
        else:
            frmt = '%' + str(self.width) + code
        return frmt % value

    def string_from_index(self, index = None):
        """Return the string for an index.

        The index is mapped through `trange` when a range is set, otherwise
        it is used as the value directly. None gives the tag text form.
        """
        if index is None:
            value = None
        elif self.trange is not None:
            value = self.trange[index]
        else:
            value = index
        return self.string(value=value)

    def value(self, string):
        """Convert a matched string to the data type of the tag."""
        return self.dtype()(string)

    def index(self, value):
        """Return the index that belongs to a value.

        Without a range the value itself is the index; string values then
        cannot be mapped. With a range the position of the first equal entry
        is returned.

        Raises
        ------
        IndexError
            If a string value is given without a range, or if the value is
            not contained in the range.
        """
        if self.trange is None:
            if isinstance(value, str):
                raise IndexError('No range to determine index for tag %r and value %r!' % (self, value))
            return value

        for i, r in enumerate(self.trange):
            if value == r:
                return i
        raise IndexError('Value %r not in tag range %r!' % (value, self.trange))

    def parse(self, tag):
        """Set the tag properties from the text form '<Name,Type,Width>'.

        The first entry is never read as a width, so a purely numeric first
        entry is taken as the name. Entries equal to TAG_INT or TAG_STR are
        taken as the type.

        Raises
        ------
        ValueError
            If the delimiters are missing, more than three entries are given,
            an entry is empty, or a property is specified more than once.
        """
        if len(tag) < len(TAG_START) + len(TAG_END):
            raise ValueError('The string %s is not a valid tag!' % tag)
        if tag[:len(TAG_START)] != TAG_START:
            raise ValueError('Expecting the tag to start with %s found %s!' % (TAG_START, tag[:len(TAG_START)]))
        if tag[-len(TAG_END):] != TAG_END:
            raise ValueError('Expecting the tag to end with %s found %s!' % (TAG_END, tag[-len(TAG_END):]))

        tag = tag[len(TAG_START):-len(TAG_END)]
        if len(tag) == 0:
            self.__init__()
            return

        values = tag.split(TAG_SEPARATOR)
        if len(values) > 3:
            raise ValueError('Found %d > %d tag specifications!' % (len(values), 3))

        name = []
        ttype = []
        width = []
        for i, v in enumerate(values):
            if i > 0:
                # Only entries after the first one may specify the width.
                try:
                    width.append(int(v))
                    continue
                except ValueError:
                    pass
            if v in (TAG_INT, TAG_STR):
                ttype.append(v)
            elif len(v) == 0:
                raise ValueError('Found two tag separators without a value in between!')
            else:
                name.append(v)

        if len(name) > 1:
            raise ValueError('More than one name found in tag: %r' % name)
        if len(ttype) > 1:
            raise ValueError('More than one type found in tag: %r' % ttype)
        if len(width) > 1:
            raise ValueError('More than one widht found in tag: %r' % width)

        name = name[0] if len(name) == 1 else None
        ttype = ttype[0] if len(ttype) == 1 else None
        width = width[0] if len(width) == 1 else None

        self.__init__(name=name, ttype=ttype, width=width)

    def __str__(self):
        return self.tag()

    def __repr__(self):
        return self.__str__()
class Expression(object):
    """A file name pattern made of literal strings and tags.

    Attributes
    ----------
    pattern : list
        Sequence of literal strings and :class:`Tag` objects in the order
        they appear in the expression.
    tags : list of Tag
        The tags of the pattern that are not references to earlier tags.
    """

    def __init__(self, pattern = None):
        """Create an expression.

        Arguments
        ---------
        pattern : Expression, str, list or None
            An expression to copy (shallow copies of its lists), a tag
            expression string to parse, or a list of strings and tags.
        """
        if isinstance(pattern, Expression):
            self.pattern = copy.copy(pattern.pattern)
            self.tags = copy.copy(pattern.tags)
        elif isinstance(pattern, str):
            self.parse(pattern)
        else:
            if pattern is None:
                pattern = []
            self.pattern = pattern
            self.tags = [p for p in pattern if isinstance(p, Tag) and not p.reference]

    def _labelled_pattern(self):
        """Yield (part, label) for every entry of the pattern.

        Unnamed tags are numbered by their position among the non-reference
        tags, i.e. exactly as in tag_names(), values() and indices().
        References use their own name. The label of literal strings is None.
        """
        n_tag = 0
        for p in self.pattern:
            if not isinstance(p, Tag):
                yield p, None
            elif p.reference:
                # References are not part of self.tags and must not shift
                # the numbering of the unnamed tags that follow them.
                yield p, p.name
            else:
                yield p, p.label(n_tag)
                n_tag += 1

    def tag(self):
        """Return the expression in its text form with all tags written out."""
        return ''.join(p.tag() if isinstance(p, Tag) else p for p in self.pattern)

    def re(self):
        """Return a regular expression for the expression.

        Literal parts are escaped, tags become named groups.
        """
        parts = []
        for p, label in self._labelled_pattern():
            if isinstance(p, Tag):
                parts.append(p.re(name = label))
            else:
                parts.append(re.escape(p))
        return ''.join(parts)

    def glob(self, values = None):
        """Return a glob pattern for the expression.

        Arguments
        ---------
        values : dict or None
            Optional mapping of tag labels to values. Tags found in it are
            replaced by the escaped value string instead of a wildcard.
        """
        parts = []
        for p, label in self._labelled_pattern():
            if not isinstance(p, Tag):
                parts.append(escape_glob(p))
            elif values is not None and label in values:
                parts.append(escape_glob(p.string(value = values[label])))
            else:
                parts.append(p.glob())
        return ''.join(parts)

    def string(self, values = None):
        """Return the expression with tags replaced by the given values.

        Arguments
        ---------
        values : dict or None
            Mapping of tag labels to values. Tags without a value are kept
            in their text form.
        """
        parts = []
        for p, label in self._labelled_pattern():
            if isinstance(p, Tag):
                v = values.get(label, None) if values is not None else None
                parts.append(p.string(value = v))
            else:
                parts.append(p)
        return ''.join(parts)

    def values(self, string):
        """Return the tag values found in a string.

        Returns
        -------
        values : dict
            Mapping of tag labels to values converted to the tag data type.
            Empty if the string does not match the expression.
        """
        match = re.compile(self.re()).search(string)
        if match is None:
            return {}

        d = match.groupdict()
        for i, t in enumerate(self.tags):
            k = t.label(i)
            if k in d:
                d[k] = t.dtype()(d[k])
        return d

    def string_from_index(self, indices):
        """Return the expression with tags replaced via their indices.

        Arguments
        ---------
        indices : dict or sequence
            Mapping of tag labels to indices, or one index per
            non-reference tag in the order of `tags`.
        """
        if not isinstance(indices, dict):
            indices = {t.label(i) : indices[i] for i, t in enumerate(self.tags)}

        parts = []
        for p, label in self._labelled_pattern():
            if isinstance(p, Tag):
                parts.append(p.string_from_index(index = indices[label]))
            else:
                parts.append(p)
        return ''.join(parts)

    def indices(self, string):
        """Return the index of every non-reference tag found in a string.

        Raises
        ------
        ValueError
            If the string does not match the expression.
        """
        match = re.compile(self.re()).search(string)
        if match is None:
            raise ValueError('Cannot infer indices from string!')

        d = match.groupdict()
        return [t.index(t.dtype()(d[t.label(i)])) for i, t in enumerate(self.tags)]

    def tag_names(self):
        """Return the labels of the non-reference tags."""
        return [t.label(i) for i, t in enumerate(self.tags)]

    def ntags(self):
        """Return the number of non-reference tags."""
        return len(self.tags)

    def __getitem__(self, i):
        """Return a tag by its position in `tags` or by its label.

        Raises
        ------
        IndexError
            If no tag with the given label exists.
        """
        if isinstance(i, int):
            return self.tags[i]
        for j, n in enumerate(self.tag_names()):
            if n == i:
                return self.tags[j]
        raise IndexError('No tag with name %r!' % i)

    def parse(self, expression):
        """Set the pattern from a tag expression string.

        Tags that repeat the name of an earlier tag are marked as references
        and inherit type and width from it where they do not define their
        own.

        Raises
        ------
        ValueError
            If a tag is malformed or a reference has a different type than
            the tag it refers to.
        """
        tag_re = re.compile(re.escape(TAG_START) + '.*?' + re.escape(TAG_END))

        pattern = []
        tags = []
        start = 0
        for match in tag_re.finditer(expression):
            if match.start() > start:
                pattern.append(expression[start:match.start()])
            tag = Tag(tag = match.group())
            pattern.append(tag)
            tags.append(tag)
            start = match.end()
        if start < len(expression):
            pattern.append(expression[start:])

        # check for references
        for i, t in enumerate(tags):
            if t.name is not None and t.reference is not True:
                for r in tags[i+1:]:
                    if r.name != t.name:
                        continue
                    r.reference = True
                    if r.ttype is None:
                        r.ttype = t.ttype
                    elif ttype_to_dtype(r.ttype) != ttype_to_dtype(t.ttype):
                        # Compare data types so that an implicit (None) and
                        # an explicit integer type are not seen as a clash.
                        raise ValueError('The reference %r has not the same type as the tag %r!' % (r,t))
                    if r.width is None:
                        r.width = t.width

        self.pattern = pattern
        self.tags = [q for q in pattern if isinstance(q, Tag) and not q.reference]

    def detect(self, strings, names = None, max_check = None, with_trange = False):
        """Infer the pattern from a list of strings of equal length.

        Every run of consecutive positions at which the strings differ
        becomes one fixed-width tag. A tag is extended to the left over
        zeros so that zero padded numbers are captured completely.

        Arguments
        ---------
        strings : list of str or str
            The strings, e.g. file names, to infer the expression from.
        names : list of str or None
            Names for the detected tags in order of appearance. Remaining
            tags get default names. The list is not modified.
        max_check : int or None
            Number of strings inspected to determine type and range of a
            tag, defaults to all strings.
        with_trange : bool
            If True, each tag stores the values found in the inspected
            strings (one per string, in order) as its range.

        Raises
        ------
        ValueError
            If no strings are given or the strings differ in length.
        """
        if not isinstance(strings, list):
            strings = [strings]
        if len(strings) == 0:
            raise ValueError('Cannot infer tag expression from an empty list of strings!')

        s0 = strings[0]
        for s in strings:
            if len(s) != len(s0):
                raise ValueError('Cannot infer tag expression from strings of different length!')

        if max_check is None:
            max_check = len(strings)

        # Work on a copy, names are consumed below.
        names = [] if names is None else list(names)

        # detect differences in filenames
        others = strings[1:]
        tags = []
        tag_start = None
        for i, c in enumerate(s0):
            if any(s[i] != c for s in others):
                if tag_start is None:
                    tag_start = i
            elif tag_start is not None:
                tags.append((tag_start, i))
                tag_start = None
        if tag_start is not None:
            # The strings differ up to their very end.
            tags.append((tag_start, len(s0)))

        # detect leading zeros, but never grow into the previous tag
        tags_full = []
        lower = 0
        for s, e in tags:
            while s > lower and s0[s-1] == '0':
                s -= 1
            tags_full.append((s, e))
            lower = e
        tags = tags_full

        # infer pattern
        pattern = []
        p = 0
        for i, (s, e) in enumerate(tags):
            if s > p:
                pattern.append(s0[p:s])
            p = e

            raw = [x[s:e] for x in strings[:max_check]]
            try:
                values = [int(v) for v in raw]
                ttype = TAG_INT
            except ValueError:
                # A single non-integer makes this a string tag; keep all
                # values as strings so that the range has one type.
                values = raw
                ttype = TAG_STR

            name = names.pop(0) if len(names) > 0 else default_tag_name(i)
            trange = values if with_trange else None
            pattern.append(Tag(name = name, ttype = ttype, width = e - s, trange = trange))

        if p < len(s0):
            pattern.append(s0[p:])

        self.pattern = pattern
        self.tags = [q for q in pattern if isinstance(q, Tag) and not q.reference]

    def __str__(self):
        return 'TagExpression(' + self.tag() + ')'

    def __repr__(self):
        return self.__str__()
def parse(expression):
    """Create an :class:`Expression` from a tag expression string.

    Arguments
    ---------
    expression : str
        The tag expression, e.g. 'file_<X,2>_<Y,3>.npy'.

    Returns
    -------
    expression : Expression
        A new expression whose pattern was parsed from the string.
    """
    result = Expression()
    result.parse(expression=expression)
    return result
def detect(strings, names = None, max_check = None, with_trange = False):
    """Create an :class:`Expression` inferred from a list of strings.

    All arguments are passed on unchanged to :meth:`Expression.detect`.

    Arguments
    ---------
    strings : list of str or str
        The strings to infer the tag expression from.
    names : list of str or None
        Names for the detected tags.
    max_check : int or None
        Number of strings to inspect for each tag.
    with_trange : bool
        Whether the detected tags store the values found.

    Returns
    -------
    expression : Expression
        A new expression holding the detected pattern.
    """
    result = Expression()
    result.detect(
        strings=strings,
        names=names,
        max_check=max_check,
        with_trange=with_trange,
    )
    return result
def escape_glob(string):
    """Escape the glob special characters of a string.

    Each of the characters '*', '?', '[' and ']' is wrapped in square
    brackets so that it is matched literally by glob.

    Arguments
    ---------
    string : str
        The literal text to escape.

    Returns
    -------
    escaped : str
        The text with every special character enclosed in '[' and ']'.
    """
    # '*' is a wildcard as well and has to be escaped like '?' and '['.
    return ''.join('[' + c + ']' if c in '*?[]' else c for c in string)
def _test():
    """Tests"""
    import ClearMap.Utils.TagExpression as te
    #reload(te)

    # values and strings: a string is split into values and put together again
    expression = te.parse('/test/test<X,I,4>_<Y,I,3>_<X>.tif')
    filename = '/test/test0010_013_0010.tif'
    found = expression.values(filename)
    rebuilt = expression.string(found)
    filename == rebuilt
    expression.string({'X' : 111})

    # indices
    expression.indices(filename)

    expression = te.parse('/test/test<X,I,4>_<Y,S>.tif')
    expression['X']
    expression['Y'].trange = list('abcd')
    expression.string_from_index([0,2])

    # glob: list the files of the test sequence
    import ClearMap.Tests.Files as tf
    import glob
    sequence = tf.io.join(tf.tif_sequence, 'sequence<I,4>.tif')
    expression = te.parse(sequence)
    files = glob.glob(expression.glob())

    # detection: infer the expression back from the listed files
    te.detect(files, names = ['X'])
    expression = te.detect(files, names = ['X'], with_trange = True)
    expression['X'].trange