Source code for pylablib.core.fileio.dict_entry

"""
Classes for dealing with the :class:`.Dictionary` entries with special conversion rules when saved or loaded.
Used to redefine how certain objects (e.g., tables) inside dictionaries are written into files and read from files.
"""


from ..utils import dictionary, py3
from . import location, parse_csv
from .loadfile_utils import InlineTable

import numpy as np
import pandas as pd



def is_dict_entry_branch(branch):
    """
    Tell whether a dictionary branch holds an entry which requires special conversion.

    A branch qualifies if it supports membership tests and contains the ``"__data_type__"`` key.
    Objects for which the membership test raises :exc:`TypeError` (e.g., numbers or ``None``) yield ``False``.
    """
    try:
        has_marker = "__data_type__" in branch
    except TypeError:
        return False
    return has_marker
def _is_data_valid(data, pred):
    """
    Check a data object against a predicate.

    Args:
        data: object to check
        pred: ``None`` (no check is defined), a type or a tuple (passed to ``isinstance`` as is),
            or any other callable which takes the data and returns the verdict

    Return:
        ``None`` if `pred` is ``None``; otherwise the result of the ``isinstance`` check or of the ``pred(data)`` call.
    """
    if pred is None:
        return None
    if isinstance(pred, (type, tuple)):
        return isinstance(data, pred)
    return pred(data)


def _is_branch_valid(branch, pred):
    """
    Check the ``"__data_type__"`` marker of a dictionary branch against a predicate.

    Args:
        branch: dictionary branch; must contain the ``"__data_type__"`` key
        pred: ``None`` (no check is defined), a tuple of allowed markers, a single marker string,
            or a callable which takes the marker and returns the verdict

    Return:
        ``None`` if `pred` is ``None``; otherwise the result of the corresponding check.
    """
    bdt = branch["__data_type__"]
    if pred is None:
        return None
    if isinstance(pred, tuple):
        return bdt in pred
    if isinstance(pred, py3.textstring):
        # a single marker must match exactly: ``bdt in pred`` on a string is a substring test,
        # which would accept e.g. ``""`` or ``"tab"`` for the ``"table"`` marker
        return bdt == pred
    return pred(bdt)


##### Entry builders and parsers #####
class DictEntryBuilder:
    """
    Factory wrapper which produces dictionary entries out of data objects.

    Args:
        entry_cls: dictionary entry class
        pred: method used to check if an object can be turned into the corresponding entry;
            if ``None`` (or otherwise falsy), use the default entry class checker (``entry_class.is_data_valid``)
        kwargs: keyword arguments passed to the entry constructor along with the data
    """

    def __init__(self, entry_cls, pred=None, **kwargs):
        self.entry_cls, self.pred, self.kwargs = entry_cls, pred, kwargs

    def is_data_valid(self, data):
        """Check if a data object can be wrapped by the current entry class"""
        if not self.pred:
            # no custom predicate: defer to the entry class
            return self.entry_cls.is_data_valid(data)
        return _is_data_valid(data, self.pred)

    def from_data(self, data):
        """Build a dictionary entry from the data"""
        extra = self.kwargs
        return self.entry_cls(data, **extra)
class DictEntryParser:
    """
    Factory wrapper which produces dictionary entries out of dictionary branches.

    Args:
        entry_cls: dictionary entry class
        pred: method used to check if a dictionary branch can be turned into the corresponding entry;
            if ``None`` (or otherwise falsy), use the default entry class checker (``entry_class.is_branch_valid``)
        kwargs: keyword arguments passed to the entry ``from_dict`` class method along with the branch
    """

    def __init__(self, entry_cls, pred=None, **kwargs):
        self.entry_cls, self.pred, self.kwargs = entry_cls, pred, kwargs

    def is_branch_valid(self, branch):
        """Check if a branch can be parsed by the current entry class"""
        if not self.pred:
            # no custom predicate: defer to the entry class
            return self.entry_cls.is_branch_valid(branch)
        return _is_branch_valid(branch, self.pred)

    def from_dict(self, dict_ptr, loc):
        """Build a dictionary entry from the branch and the file location"""
        extra = self.kwargs
        return self.entry_cls.from_dict(dict_ptr, loc, **extra)
# module-wide registries of entry builders and parsers;
# extended via ``add_dict_entry_builder`` / ``add_dict_entry_parser`` and consulted by ``from_data`` / ``from_dict``
_default_builders=[]
_default_parsers=[]
def add_dict_entry_builder(builder):
    """
    Register an entry builder globally.

    The builder is appended to the end of the module-wide default builders list,
    so builders registered earlier are tried first.
    """
    _default_builders.append(builder)
def add_dict_entry_parser(parser):
    """
    Register an entry parser globally.

    The parser is appended to the end of the module-wide default parsers list,
    so parsers registered earlier are tried first.
    """
    _default_parsers.append(parser)
def add_dict_entry_class(cls):
    """
    Register an entry class globally.

    Creates a builder and a parser for the class with no custom predicate and no additional arguments
    (so the class methods decide whether an object/branch can be converted into an entry),
    and adds them to the global lists.
    """
    default_builder = DictEntryBuilder(cls)
    default_parser = DictEntryParser(cls)
    add_dict_entry_builder(default_builder)
    add_dict_entry_parser(default_parser)
def from_data(data, builders=None):
    """
    Build a dictionary entry from the data.

    If `data` is already an :class:`IDictionaryEntry`, it is returned as is.
    `builders` can contain an additional list of builders to try before using the default ones.
    Return ``None`` if none of the builders accepts the data.
    """
    if isinstance(data, IDictionaryEntry):
        return data
    candidates = (builders or []) + _default_builders
    for builder in candidates:
        if not builder.is_data_valid(data):
            continue
        # the first matching builder wins
        return builder.from_data(data)
    return None
def from_dict(dict_ptr, loc, parsers=None):
    """
    Build a dictionary entry from the dictionary branch and the file location.

    `parsers` can contain an additional list of parsers to try before using the default ones.
    Return ``None`` if none of the parsers accepts the branch.
    """
    candidates = (parsers or []) + _default_parsers
    for parser in candidates:
        if not parser.is_branch_valid(dict_ptr):
            continue
        # the first matching parser wins
        return parser.from_dict(dict_ptr, loc)
    return None
### General description ###
class IDictionaryEntry:
    """
    A generic `Dictionary` entry.

    Holds the data represented by the node and defines how this data is represented as a dictionary branch.

    Args:
        data: data to be wrapped
    """

    # data type marker (a string marker of the entry class which is saved in the dictionary under ``__data_type__``)
    _data_type = None
    # predicate for the wrapped objects (handed to ``_is_data_valid`` by ``is_data_valid``)
    _obj_type = None

    def __init__(self, data):
        self.data = data

    @classmethod
    def is_data_valid(cls, data):
        """Check if a data object can be wrapped by the current entry class"""
        return _is_data_valid(data, cls._obj_type)

    @classmethod
    def is_branch_valid(cls, branch):
        """Check if a branch can be parsed by the current entry class"""
        return _is_branch_valid(branch, cls._data_type)

    @classmethod
    def from_dict(cls, dict_ptr, loc):  # pylint: disable=unused-argument
        """
        Convert a dictionary branch to a specific :class:`IDictionaryEntry` object.

        The generic implementation returns an entry wrapping ``None`` if the branch is valid for this class,
        and ``None`` otherwise.

        Args:
            dict_ptr (dictionary.DictionaryPointer): Pointer to the dictionary location for the entry.
            loc: Location for the data to be loaded.
        """
        if cls.is_branch_valid(dict_ptr):
            return cls(None)
        return None

    def to_dict(self, dict_ptr, loc):  # pylint: disable=unused-argument
        """
        Convert data to a dictionary branch on saving.

        The generic implementation returns a branch which contains only the data type marker.

        Args:
            dict_ptr (dictionary.DictionaryPointer): Pointer to the dictionary location for the entry.
            loc: File location for the data to be saved.
        """
        marker = {"__data_type__": self._data_type}
        return dictionary.Dictionary(marker)
### Table formatters ###
def parse_stored_table_data(desc=None, data=None, out_type="pandas"):
    """
    Parse table data corresponding to the given description dictionary and data.

    Args:
        desc: description dictionary; can be ``None``, if no description is given
        data: separately loaded data; can be ``None``, if no data is given (in this case assume that it is stored in the description dictionary);
            can be a tuple ``(column_data, column_names)`` (such as the one returned by :func:`.parse_csv.read_table`),
            or an :class:`.InlineTable` object containing such tuple.
        out_type (str): Output format of the data (``'array'`` for numpy arrays or ``'pandas'`` for pandas DataFrame objects);
            overridden by the ``"__cont_type__"`` value of the description, if it is present.

    Return:
        tuple ``(data, columns)``, where ``data`` is the data table in the specified format, and ``columns`` is the list of columns

    Raises:
        ValueError: if no data is available, or if (for non-pandas output) a non-empty column list disagrees with the data length.
    """
    # NOTE(review): block nesting below is reconstructed from a flattened listing; confirm against the original file
    desc=desc or {}
    data=desc.get("data",data) # data stored in the description takes priority over the separately supplied one
    if data is None:
        raise ValueError("can't load {0} with format {1}".format(desc,"inline"))
    if isinstance(data,InlineTable):
        data=data.table
    data,columns=data # from here on ``data`` is the column data part of the ``(column_data, column_names)`` tuple
    columns=desc.get("columns",columns) # column names from the description take priority
    out_type=desc.get("__cont_type__",out_type)
    if out_type in {"datatable","table"}: # legacy file formats
        out_type="pandas"
    if len(data)==0:
        data=parse_csv.columns_to_table([],columns=columns,out_type=out_type)
    if out_type=="pandas":
        data=pd.DataFrame(dict(zip(columns,data)),columns=columns)
        if "index_columns" in desc:
            # the first ``index_width`` columns hold the index; move them there and restore the stored index names
            index_width=len(desc["index_columns"])
            data=data.set_index(columns[:index_width])
            data.index.names=desc["index_columns"]
            columns=columns[index_width:]
        if desc.get("column_multiindex",False):
            data.columns=pd.MultiIndex.from_tuples(columns)
    else:
        if columns and len(columns)!=len(data):
            raise ValueError("columns number doesn't agree with the table size")
        data=np.column_stack(data)
    return data,columns
class ITableDictionaryEntry(IDictionaryEntry):
    """
    A generic table Dictionary entry.

    Args:
        data: Table data.
        columns (list): If not ``None``, list of column names (if ``None`` and data is a pandas DataFrame object, get column names from that).
    """
    _data_type="table" # data type marker (a string marker of the entry class which is saved in the dictionary under ``__data_type__``)
    def __init__(self, data, columns=None):
        IDictionaryEntry.__init__(self,data)
        self.columns=columns

    @classmethod
    def is_data_valid(cls, data):
        """Check if a data object can be wrapped by the current entry class (pandas DataFrame or 2D numpy array)"""
        return isinstance(data,pd.DataFrame) or (isinstance(data,np.ndarray) and data.ndim==2)

    def _prepare_desc_data(self):
        """
        Build the description dictionary and the table to be stored.

        Return:
            tuple ``(desc, data)``, where ``desc`` is a :class:`.Dictionary` with the table metadata
            (data type marker, container type and, when applicable, column multi-index flag, index column names and column names),
            and ``data`` is the table to write (a DataFrame for pandas data, a numpy array otherwise).

        Raises:
            ValueError: if explicitly supplied columns disagree in length with the DataFrame columns.
        """
        # NOTE(review): block nesting below is reconstructed from a flattened listing; confirm against the original file
        data=self.data
        desc=dictionary.Dictionary()
        desc["__data_type__"]="table"
        columns=self.columns
        if isinstance(data,pd.DataFrame):
            desc["__cont_type__"]="pandas"
            desc["column_multiindex"]=False
            if columns is None:
                columns=data.columns.tolist()
                desc["column_multiindex"]=data.columns.nlevels>1
            elif len(data.columns)!=len(columns):
                raise ValueError("supplied columns length {} doesn't agree with the data columns length {}".format(len(columns),len(data.columns)))
            # a default index (equal to a ``RangeIndex`` of the table length) is not stored
            default_idx=data.index.equals(pd.RangeIndex(stop=len(data)))
            if not default_idx:
                desc["index_columns"]=list(data.index.names)
                data=data.reset_index() # allow index/column name clashes to raise errors
                columns=data.columns.tolist()
            if desc["column_multiindex"]:
                data.columns=columns
        else:
            desc["__cont_type__"]="array"
            data=np.asarray(data)
        if columns is not None:
            desc["columns"]=columns
        return desc, data

    @classmethod
    def from_dict(cls, dict_ptr, loc, out_type="pandas"): # pylint: disable=arguments-differ
        """
        Convert a dictionary branch to a specific DictionaryEntry object.

        The table kind is taken from the ``"__table_type__"`` value; if it is missing, it is deduced from the presence
        of the ``"data"`` (inline) or ``"file_path"`` (external) keys, and the loading is delegated to the corresponding class.

        Args:
            dict_ptr (dictionary.DictionaryPointer): Pointer to the dictionary location for the entry.
            loc: Location for the data to be loaded.
            out_type (str): Output format of the data (``'array'`` for numpy arrays or ``'pandas'`` for pandas DataFrame objects),
                used only if the dictionary doesn't provide the format.

        Raises:
            ValueError: if the table kind is neither specified nor deducible.
        """
        table_type=dict_ptr.get("__table_type__",None)
        if table_type is None:
            if "data" in dict_ptr:
                table_type="inline"
            elif "file_path" in dict_ptr:
                table_type="external"
            else:
                raise ValueError("unrecognized table format: {0}".format(dict_ptr))
        if table_type=="inline":
            return InlineTableDictionaryEntry.from_dict(dict_ptr,loc,out_type=out_type)
        else:
            # any non-inline table type is treated as external
            return IExternalTableDictionaryEntry.from_dict(dict_ptr,loc,out_type=out_type)
# register the generic table entry, so that tables are recognized by the default builders and parsers
add_dict_entry_class(ITableDictionaryEntry)
class InlineTableDictionaryEntry(ITableDictionaryEntry):
    """
    A table Dictionary entry which is stored inline (inside the dictionary file itself).

    Args:
        data: Table data.
        columns (list): If not ``None``, a list of column names (if ``None`` and data is a pandas DataFrame object, get column names from that).
    """

    def to_dict(self, dict_ptr, loc):
        """
        Convert the data to a dictionary branch with the table wrapped into an :class:`.InlineTable`.

        Raises:
            ValueError: if the entry holds no data.
        """
        if self.data is None:
            raise ValueError("can't build entry for empty table")
        branch, table = self._prepare_desc_data()
        branch["__table_type__"] = "inline"
        branch["data"] = InlineTable(table)
        return branch

    @classmethod
    def from_dict(cls, dict_ptr, loc, out_type="pandas"):
        """
        Build an :class:`InlineTableDictionaryEntry` object from the dictionary and read the inlined data.

        Args:
            dict_ptr (dictionary.DictionaryPointer): Pointer to the dictionary location for the entry.
            loc: Location for the data to be loaded.
            out_type (str): Output format of the data (``'array'`` for numpy arrays or ``'pandas'`` for pandas DataFrame objects).
        """
        parsed = parse_stored_table_data(dict_ptr, out_type=out_type)
        table, column_names = parsed
        return InlineTableDictionaryEntry(table, column_names)
class IExternalTableDictionaryEntry(ITableDictionaryEntry):
    """
    Base class for table Dictionary entries which are stored in an external file.

    Args:
        data: Table data.
        file_format: Output file format (resolved through ``savefile.get_output_format``).
        name: Name template for the external file (an empty path means that the name is derived from the dictionary path).
        columns (list): If not ``None``, a list of column names.
        force_name (bool): If ``False``, ask the location to generate a new name based on the template; otherwise, use the name as is.
    """

    def __init__(self, data, file_format, name, columns, force_name=True):
        from . import savefile  # imported locally rather than at the module level
        resolved = savefile.get_output_format(data, file_format)
        data, file_format = resolved
        ITableDictionaryEntry.__init__(self, data, columns)
        self.file_format = file_format
        self.name = location.LocationName.from_object(name)
        self.force_name = force_name

    def _get_name(self, dict_ptr, loc):
        """Get the external file name for the entry located at `dict_ptr` within the location `loc`"""
        result = self.name
        if result.get_path() == "":
            # no explicit path: join the dictionary path with "_" and keep the extension
            joined_path = "_".join(dict_ptr.get_path())
            result = location.LocationName(joined_path, result.ext)
        if not self.force_name:
            result = loc.generate_new_name(result, idx=None)
        return result

    @classmethod
    def from_dict(cls, dict_ptr, loc, out_type="pandas"):
        """
        Build an external table entry from the dictionary, dispatching on its ``"file_type"`` value.

        Raises:
            ValueError: if the file type is neither ``"bin"`` nor ``"csv"``.
        """
        kind = dict_ptr.get("file_type", None)
        if kind not in {"bin", "csv"}:  # TODO: add autodetect
            raise ValueError("can't load {0} with format {1}".format(dict_ptr, "external"))
        target = ExternalTextTableDictionaryEntry if kind == "csv" else ExternalBinTableDictionaryEntry
        return target.from_dict(dict_ptr, loc, out_type=out_type)
class ExternalTextTableDictionaryEntry(IExternalTableDictionaryEntry):
    """
    A table Dictionary entry stored in an external text file.

    Args:
        data: Table data.
        file_format (str): Output file format.
        name (str): Name template for the external file (default is the full path connected with ``"_"`` symbol).
        columns (list): If not ``None``, a list of column names (if ``None`` and data is a pandas DataFrame object, get column names from that).
        force_name (bool): If ``False`` and the target file already exists, generate a new unique name; otherwise, overwrite the file.
    """

    def __init__(self, data=None, file_format="csv", name="", columns=None, force_name=True):
        IExternalTableDictionaryEntry.__init__(self, data, file_format, name, columns, force_name=force_name)

    def to_dict(self, dict_ptr, loc):
        """
        Convert the data to a dictionary branch and save the table to an external file.

        Raises:
            ValueError: if the entry holds no data.
        """
        if self.data is None:
            raise ValueError("can't build entry for empty table")
        branch, table = self._prepare_desc_data()
        file_name = self._get_name(dict_ptr, loc)
        branch["__table_type__"] = "external"
        branch["file_type"] = self.file_format.format_name
        target = location.LocationFile(loc, file_name)
        self.file_format.write(target, table)
        # the stored path is taken from the location file after the table is written
        branch["file_path"] = target.name.to_string()
        return branch

    @classmethod
    def from_dict(cls, dict_ptr, loc, out_type="pandas"):
        """
        Build an :class:`ExternalTextTableDictionaryEntry` object from the dictionary and load the external data.

        Args:
            dict_ptr (dictionary.DictionaryPointer): Pointer to the dictionary location for the entry.
            loc: Location for the data to be loaded.
            out_type (str): Output format of the data (``'array'`` for numpy arrays or ``'pandas'`` for pandas DataFrame objects).
        """
        from . import loadfile
        path = dict_ptr["file_path"]
        kind = dict_ptr.get("file_type", "csv")
        out_type = dict_ptr.get("__cont_type__", out_type)
        source = location.LocationFile(loc, path)
        # anything but the array output is read column-wise and assembled afterwards
        reader_out_type = "array" if out_type == "array" else "columns"
        reader = loadfile.build_file_format(source, file_format=kind, out_type=reader_out_type, dtype="generic")
        raw = reader.read(source).data
        table, _ = parse_stored_table_data(dict_ptr, data=raw, out_type=out_type)
        return ExternalTextTableDictionaryEntry(table, name=source.name)
class ExternalBinTableDictionaryEntry(IExternalTableDictionaryEntry):
    """
    A table Dictionary entry stored in an external binary file.

    Args:
        data: Table data.
        file_format (str): Output file format.
        name (str): Name template for the external file (default is the full path connected with ``"_"`` symbol).
        columns (list): If not ``None``, a list of column names (if ``None`` and data is a pandas DataFrame object, get column names from that).
        force_name (bool): If ``False`` and the target file already exists, generate a new unique name; otherwise, overwrite the file.
    """

    def __init__(self, data=None, file_format="bin", name="", columns=None, force_name=True):
        IExternalTableDictionaryEntry.__init__(self, data, file_format, name, columns, force_name=force_name)

    def to_dict(self, dict_ptr, loc):
        """
        Convert the data to a dictionary branch and save the table to an external file.

        In addition to the table metadata, the branch gets the file format preamble under the ``"preamble"`` key.

        Raises:
            ValueError: if the entry holds no data.
        """
        if self.data is None:
            raise ValueError("can't build entry for empty table")
        branch, table = self._prepare_desc_data()
        file_name = self._get_name(dict_ptr, loc)
        branch["__table_type__"] = "external"
        branch["file_type"] = self.file_format.format_name
        target = location.LocationFile(loc, file_name)
        self.file_format.write(target, table)
        preamble = self.file_format.get_preamble(target, table)
        branch.merge(preamble, "preamble")
        branch["file_path"] = target.name.to_string()
        return branch

    @classmethod
    def from_dict(cls, dict_ptr, loc, out_type="pandas"):
        """
        Build an :class:`ExternalBinTableDictionaryEntry` object from the dictionary and load the external data.

        Args:
            dict_ptr (dictionary.DictionaryPointer): Pointer to the dictionary location for the entry.
            loc: Location for the data to be loaded.
            out_type (str): Output format of the data (``'array'`` for numpy arrays or ``'pandas'`` for pandas DataFrame objects).
        """
        from . import loadfile
        path = dict_ptr["file_path"]
        kind = dict_ptr.get("file_type", "bin")
        preamble = dict_ptr.get("preamble", None)
        out_type = dict_ptr.get("__cont_type__", out_type)
        source = location.LocationFile(loc, path)
        # anything but the array output is read column-wise and assembled afterwards
        reader_out_type = "array" if out_type == "array" else "columns"
        reader = loadfile.build_file_format(source, file_format=kind, preamble=preamble, out_type=reader_out_type)
        raw = reader.read(source).data
        if out_type == "pandas" and len(raw[0]):
            # convert data to native byteorder (required for pandas indexing)
            native_columns = []
            for col in raw[0]:
                if isinstance(col, np.ndarray):
                    col = col.astype(col.dtype.type, copy=False)
                native_columns.append(col)
            raw = native_columns, raw[1]
        table, _ = parse_stored_table_data(dict_ptr, data=raw, out_type=out_type)
        return ExternalBinTableDictionaryEntry(table, name=source.name)
def table_entry_builder(table_format="inline"):
    """
    Make an entry builder for tables depending on the table format.

    Args:
        table_format (str): Default format for table (numpy arrays or pandas DataFrames) entries.
            Can be ``'inline'`` (table is written inside the file), ``'csv'`` (external CSV file) or ``'bin'`` (external binary file).

    Raises:
        ValueError: if the format is not one of the above.
    """
    known_formats = (
        ("inline", InlineTableDictionaryEntry),
        ("csv", ExternalTextTableDictionaryEntry),
        ("bin", ExternalBinTableDictionaryEntry),
    )
    for label, entry_cls in known_formats:
        if table_format == label:
            return DictEntryBuilder(entry_cls)
    raise ValueError("unrecognized table format: {}".format(table_format))
class IExternalFileDictionaryEntry(IDictionaryEntry):
    """
    Generic dictionary entry for data kept in an external file.

    Args:
        data: Stored data.
        name (str): Name template for the external file (default is the full path connected with ``"_"`` symbol).
        force_name (bool): If ``False`` and the target file already exists, generate a new unique name; otherwise, overwrite the file.
    """

    # data type marker (a string marker of the entry class which is saved in the dictionary under ``__data_type__``)
    _data_type = "external_file"
    # file format label of the subclass (stored in the dictionary under ``file_type``)
    file_format = None
    # registry of the known subclasses indexed by their ``file_format``
    _file_formats = {}

    def __init__(self, data, name="", force_name=True):
        IDictionaryEntry.__init__(self, data)
        self.name = location.LocationName.from_object(name)
        self.force_name = force_name

    @staticmethod
    def add_file_format(subclass):
        """
        Register an :class:`IExternalFileDictionaryEntry` as a possible stored file format.

        Used to automatically invoke a correct loader when loading the dictionary file.
        Only needs to be done once after the subclass declaration.
        """
        registry = IExternalFileDictionaryEntry._file_formats
        registry[subclass.file_format] = subclass

    def _get_name(self, dict_ptr, loc):
        """Get the external file name for the entry located at `dict_ptr` within the location `loc`"""
        result = self.name
        if result.get_path() == "":
            # no explicit path: join the dictionary path with "_" and keep the extension
            joined_path = "_".join(dict_ptr.get_path())
            result = location.LocationName(joined_path, result.ext)
        if not self.force_name:
            result = loc.generate_new_name(result, idx=None)
        return result

    def to_dict(self, dict_ptr, loc):
        """Convert the data to a dictionary branch and save the data to an external file"""
        file_name = self._get_name(dict_ptr, loc)
        branch = dictionary.Dictionary()
        branch["__data_type__"] = "external_file"
        branch["file_type"] = self.file_format
        target = location.LocationFile(loc, file_name)
        self.save_file(target)
        branch.merge(self.get_preamble(), "preamble")
        branch["file_path"] = target.name.to_string()
        return branch

    @classmethod
    def from_dict(cls, dict_ptr, loc):
        """
        Build an :class:`IExternalFileDictionaryEntry` object from the dictionary and load the external data.

        The concrete subclass is looked up among the registered file formats using the ``"file_type"`` value.

        Args:
            dict_ptr (dictionary.DictionaryPointer): Pointer to the dictionary location for the entry.
            loc: Location for the data to be loaded.

        Raises:
            ValueError: if the file type is not registered.
        """
        path = dict_ptr["file_path"]
        kind = dict_ptr.get("file_type", None)
        preamble = dict_ptr.get("preamble", {})
        source = location.LocationFile(loc, path)
        try:
            entry_cls = cls._file_formats[kind]
        except KeyError:
            raise ValueError("unrecognized file type: {}".format(kind))
        loaded = entry_cls.load_file(source, preamble)
        return entry_cls(loaded, name=source.name)

    def get_preamble(self):
        """Generate preamble (dictionary with supplementary data which allows to load the data from the file)"""
        return {}

    def save_file(self, location_file):
        """
        Save stored data into the given location.

        Virtual method, should be overloaded in subclasses
        """
        raise NotImplementedError("IExternalFileDictionaryEntry.save_file")

    @classmethod
    def load_file(cls, location_file, preamble):
        """
        Load stored data from the given location, using the supplied preamble.

        Virtual method, should be overloaded in subclasses
        """
        raise NotImplementedError("IExternalFileDictionaryEntry.load_file")
# register the generic external file entry among the default builders and parsers
add_dict_entry_class(IExternalFileDictionaryEntry)
class ExternalNumpyDictionaryEntry(IExternalFileDictionaryEntry):
    """
    A dictionary entry which keeps numpy array data in an external binary file.

    Args:
        data: Numpy array data.
        name (str): Name template for the external file (default is the full path connected with ``"_"`` symbol).
        force_name (bool): If ``False`` and the target file already exists, generate a new unique name; otherwise, overwrite the file.
        dtype: numpy dtype to load/save the data (by default, dtype of the supplied data).
    """

    file_format = "numpy"

    def __init__(self, data, name="", force_name=True, dtype=None):
        array = np.asarray(data, dtype=dtype)
        IExternalFileDictionaryEntry.__init__(self, array, name=name, force_name=force_name)

    def get_preamble(self):
        """Generate preamble (array shape and dtype string, which allow to load the data from the file)"""
        array = self.data
        return {"shape": array.shape, "dtype": array.dtype.str}

    def save_file(self, location_file):
        """Save stored data into the given location"""
        with location_file.open("wb") as out_stream:
            self.data.tofile(out_stream)

    @classmethod
    def load_file(cls, location_file, preamble):
        """Load stored data from the given location, using the dtype and shape from the supplied preamble"""
        with location_file.open("rb") as in_stream:
            flat = np.fromfile(in_stream, dtype=preamble["dtype"])
            return flat.reshape(preamble["shape"])
# make the ``"numpy"`` file type loadable through ``IExternalFileDictionaryEntry.from_dict``
IExternalFileDictionaryEntry.add_file_format(ExternalNumpyDictionaryEntry)
class ExpandedContainerDictionaryEntry(IDictionaryEntry):
    """
    A dictionary entry which expands containers (lists, tuples, dictionaries) into subdictionaries.

    Useful when the data in the containers is complex, so writing it into one line
    (as is default for lists and tuples) wouldn't work.

    Args:
        data: Container data.
    """

    # data type marker (a string marker of the entry class which is saved in the dictionary under ``__data_type__``)
    _data_type = "exp_container"

    def to_dict(self, dict_ptr, loc):
        """
        Convert the stored container to a dictionary branch.

        The elements are stored under their integer positions; dictionary items are stored
        as ``{"k": key, "v": value}`` records.

        Raises:
            ValueError: if the data is not a list, a tuple, or a dictionary.
        """
        payload = self.data
        for container_cls, label in ((list, "list"), (tuple, "tuple"), (dict, "dict")):
            if isinstance(payload, container_cls):
                clabel = label
                break
        else:
            raise ValueError("unrecognized container type of {}".format(payload))
        branch = dictionary.Dictionary()
        branch["__data_type__"] = "exp_container"
        branch["container_type"] = clabel
        if isinstance(payload, dict):
            elements = [{"k": key, "v": value} for (key, value) in payload.items()]
        else:
            elements = payload
        for position, element in enumerate(elements):
            branch[position] = element
        return branch

    @classmethod
    def from_dict(cls, dict_ptr, loc):
        """Build an :class:`ExpandedContainerDictionaryEntry` object from the dictionary"""
        clabel = dict_ptr["container_type"]
        # two of the branch entries are not elements (``to_dict`` writes ``__data_type__`` and ``container_type``)
        count = len(dict_ptr) - 2
        if clabel in ["list", "tuple"]:
            elements = [dict_ptr[idx] for idx in range(count)]
            value = tuple(elements) if clabel == "tuple" else elements
        else:
            value = {dict_ptr[idx, "k"]: dict_ptr[idx, "v"] for idx in range(count)}
        return cls(value)
# register the expanded container entry among the default builders and parsers
add_dict_entry_class(ExpandedContainerDictionaryEntry)