Source code for pylablib.core.fileio.loadfile

"""
Utilities for reading data files.
"""

from . import datafile, location, dict_entry, parse_csv, loadfile_utils
from ..utils import funcargparse, library_parameters

import numpy as np

# Register library-wide default output types for the loaders; overwrite=False keeps
# any value the user has already configured before this module was imported.
library_parameters.library_parameters.update({"fileio/loadfile/csv/out_type":"pandas"},overwrite=False)
library_parameters.library_parameters.update({"fileio/loadfile/dict/inline_out_type":"pandas"},overwrite=False)


##### File formats #####


class IInputFileFormat:
    """
    Generic class for an input file format.

    Based on `file_format` or autodetection, calls one of its subclasses to read the file.

    Defines a single static method
    """
    @staticmethod
    def detect_file_format(location_file):
        # binary vs. text detection; text files get further refined by ITextInputFileFormat
        with location_file.open("rb") as stream:
            if loadfile_utils.detect_binary_file(stream):
                return BinaryTableInputFileFormatter
            return ITextInputFileFormat.detect_file_format(location_file)
    def read(self, location_file):
        """Read a file at a given location"""
        raise NotImplementedError("{}.read".format(type(self).__name__))
class ITextInputFileFormat(IInputFileFormat):  # pylint: disable=abstract-method
    """
    Generic class for a text input file format.

    Based on `file_format` or autodetection, calls one of its subclasses to read the file.
    """
    @staticmethod
    def detect_file_format(location_file):
        with location_file.open("r") as stream:
            txt_type=loadfile_utils.detect_textfile_type(stream)
        # map detected text kind onto the corresponding reader class
        if txt_type=="table":
            return CSVTableInputFileFormat
        if txt_type=="dict":
            return DictionaryInputFileFormat
        raise IOError("can't detect file type")
class CSVTableInputFileFormat(ITextInputFileFormat):
    """
    Class for CSV input file format.

    Args:
        out_type (str): type of the result: ``'array'`` for numpy array, ``'pandas'`` for pandas DataFrame,
            or ``'default'`` (determined by the library default; ``'pandas'`` by default)
        dtype: dtype of entries; can be either a single type, or a list of types (one per column).
            Possible dtypes are: ``'int'``, ``'float'``, ``'complex'``, ``'numeric'`` (tries to coerce to minimal possible numeric type,
            raises error if data can't be converted to `complex`), ``'generic'`` (accept arbitrary types, including lists, dictionaries,
            escaped strings, etc.), ``'raw'`` (keep raw string).
        columns: either a number of columns, or a list of columns names.
        delimiters (str): Regex string which recognizes entries delimiters (by default ``r"\\s*,\\s*|\\s+"``, i.e., commas and whitespaces).
        empty_entry_substitute: Substitute for empty table entries. If ``None``, all empty table entries are skipped.
        ignore_corrupted_lines (bool): If ``True``, skip corrupted (e.g., non-numeric for numeric dtype, or with too few entries) lines;
            otherwise, raise :exc:`ValueError`.
        skip_lines (int): Number of lines to skip from the beginning of the file.
    """
    def __init__(self, out_type="default", dtype="numeric", columns=None, delimiters=None, empty_entry_substitute=None, ignore_corrupted_lines=True, skip_lines=0):
        ITextInputFileFormat.__init__(self)
        # "default" defers to the library-wide setting registered at import time
        if out_type=="default":
            out_type=library_parameters.library_parameters["fileio/loadfile/csv/out_type"]
        self.out_type=out_type
        self.dtype=dtype
        self.columns=columns
        self.delimiters=delimiters or parse_csv._table_delimiters
        self.empty_entry_substitute=empty_entry_substitute
        self.ignore_corrupted_lines=ignore_corrupted_lines
        self.skip_lines=skip_lines
    def read(self, location_file):
        """Read a CSV file at the given location and return a :class:`.DataFile` with the parsed table"""
        with location_file.open("r") as stream:
            for _ in range(self.skip_lines):
                stream.readline()
            data,comments,corrupted=parse_csv.read_table(stream,dtype=self.dtype,columns=self.columns,out_type=self.out_type,
                delimiters=self.delimiters,empty_entry_substitute=self.empty_entry_substitute,ignore_corrupted_lines=self.ignore_corrupted_lines)
        # for pandas output with no explicit column names, try to recover column names from comment/corrupted lines
        autodetect_columns=not funcargparse.is_sequence(self.columns,"builtin;nostring")
        if self.out_type=="pandas" and autodetect_columns and len(data)>0:
            detected_columns,comment_idx=loadfile_utils.find_columns_lines(corrupted,comments,data.shape[1])
            if comment_idx is not None:
                del comments[comment_idx]  # the columns line is metadata, not a regular comment
            if detected_columns is not None:
                data.columns=detected_columns
        creation_time=loadfile_utils.find_savetime_comment(comments)
        return datafile.DataFile(data=data,comments=comments,creation_time=creation_time,filetype="csv")
class DictionaryInputFileFormat(ITextInputFileFormat):
    """
    Class for Dictionary input file format.

    Args:
        case_normalization (str): If ``None``, the dictionary paths are case-sensitive;
            otherwise, defines the way the entries are normalized (``'lower'`` or ``'upper'``).
        inline_dtype (str): dtype for inlined tables.
        inline_out_type (str): type of the result of the inline table: ``'array'`` for numpy array, ``'pandas'`` for pandas DataFrame,
            ``'raw'`` for raw :class:`.InlineTable` data containing tuple ``(column_data, column_names)``,
            or ``'default'`` (determined by the library default; ``'pandas'`` by default).
        entry_format (str): Determines the way for dealing with :class:`.dict_entry.IDictionaryEntry` objects
            (objects transformed into dictionary branches with special recognition rules). Can be
            ``'branch'`` (don't attempt to recognize those objects, leave dictionary as in the file),
            ``'dict_entry'`` (recognize and leave as :class:`.dict_entry.IDictionaryEntry` objects) or
            ``'value'`` (recognize and keep the value).
        allow_duplicate_keys (bool): if ``False`` and the same key is mentioned twice in the file, raise an error
        skip_lines (int): Number of lines to skip from the beginning of the file.
    """
    def __init__(self, case_normalization=None, inline_dtype="generic", inline_out_type="default", entry_format="value", allow_duplicate_keys=False, skip_lines=0):
        ITextInputFileFormat.__init__(self)
        self.case_normalization=case_normalization
        self.inline_dtype=inline_dtype
        # "default" defers to the library-wide setting registered at import time
        self.inline_out_type=library_parameters.library_parameters["fileio/loadfile/dict/inline_out_type"] if inline_out_type=="default" else inline_out_type
        # validate eagerly so a typo fails at construction, not mid-read
        if entry_format not in {"branch","dict_entry","value"}:
            raise ValueError("unrecognized entry format: {0}".format(entry_format))
        self.entry_format=entry_format
        self.allow_duplicate_keys=allow_duplicate_keys
        self.skip_lines=skip_lines
    def read(self, location_file):
        """Read a dictionary file at the given location and return a :class:`.DataFile` with the parsed dictionary"""
        with location_file.open("r") as stream:
            for _ in range(self.skip_lines):
                stream.readline()
            data,comments=loadfile_utils.read_dict_and_comments(stream,inline_dtype=self.inline_dtype,
                case_normalization=self.case_normalization,allow_duplicate_keys=self.allow_duplicate_keys)
        creation_time=loadfile_utils.find_savetime_comment(comments)
        def map_entries(ptr):
            # convert specially-formatted branches into dict_entry objects (or their values)
            if dict_entry.is_dict_entry_branch(ptr):
                entry=dict_entry.from_dict(ptr,location_file.loc)
                if self.entry_format=="value":
                    entry=entry.data
                return entry
            else:
                return ptr
        if self.entry_format!="branch":
            data.map_self(map_entries,to_visit="branches",topdown=False)
        def map_inline_tables(ptr):
            if not dict_entry.is_dict_entry_branch(ptr):  # check if there is an inline table not in the entry
                for k,v in ptr.items():
                    if isinstance(v,loadfile_utils.InlineTable):
                        ptr[k],_=dict_entry.parse_stored_table_data(data=v,out_type=self.inline_out_type)
            return ptr
        if self.inline_out_type!="raw":
            data.map_self(map_inline_tables,to_visit="branches",topdown=False)
        if len(data)==1 and list(data.keys())==["__data__"]:  # special case of files with preamble
            data=data["__data__"]
        return datafile.DataFile(data=data,comments=comments,creation_time=creation_time,filetype="dict")
class BinaryTableInputFileFormatter(IInputFileFormat):
    """
    Class for binary input file format.

    Args:
        out_type (str): type of the result: ``'array'`` for numpy array, ``'pandas'`` for pandas DataFrame,
            or ``'default'`` (determined by the library default; ``'pandas'`` by default)
        dtype: :class:`numpy.dtype` describing the data.
        columns: either a number of columns, or a list of columns names.
        packing (str): The way the 2D array is packed. Can be either ``'flatten'`` (data is stored row-wise)
            or ``'transposed'`` (data is stored column-wise).
        preamble (dict): If not ``None``, defines binary file parameters that supersede the parameters supplied to the function.
            The defined parameters are ``'dtype'``, ``'packing'``, ``'ncols'`` (number of columns) and ``'nrows'`` (number of rows).
        skip_bytes (int): Number of bytes to skip from the beginning of the file.
    """
    def __init__(self, out_type="default", dtype="<f8", columns=None, packing="flatten", preamble=None, skip_bytes=0):
        IInputFileFormat.__init__(self)
        self.out_type=library_parameters.library_parameters["fileio/loadfile/csv/out_type"] if out_type=="default" else out_type
        self.preamble=preamble or {}
        self.dtype=self.preamble.get("dtype",dtype)
        # `columns` may be a list of names (len() works) or a plain column count (len() raises TypeError)
        try:
            self.columns_num=len(columns)
            self.columns=columns
        except TypeError:
            self.columns_num=columns
            self.columns=None
        self.packing=self.preamble.get("packing",packing)
        self.skip_bytes=skip_bytes
        self.preamble_columns_num=self.preamble.get("ncols",None)
        if self.columns_num is None:
            self.columns_num=self.preamble_columns_num
        elif self.preamble_columns_num is not None and self.preamble_columns_num!=self.columns_num:
            raise ValueError("supplied columns number {0} disagrees with extracted from preamble {1}".format(self.columns_num,self.preamble_columns_num))
        self.preamble_rows_num=self.preamble.get("nrows",None)
    def read(self, location_file):
        """Read a binary file at the given location and return a :class:`.DataFile` with the parsed table"""
        with location_file.open("rb") as stream:
            if self.skip_bytes:
                stream.seek(self.skip_bytes,1)
            data=np.fromfile(stream,dtype=self.dtype)
        if self.columns_num is not None:
            if self.packing=="flatten":
                data=data.reshape((-1,self.columns_num))
            elif self.packing=="transposed":
                # fixed: ndarray has no `.transposed()` method (was an AttributeError); use `.transpose()`
                data=data.reshape((self.columns_num,-1)).transpose()
            else:
                raise ValueError("unrecognized packing method: {0}".format(self.packing))
        else:
            data=data[None,:]  # no column info: treat the whole file as a single row
        if self.preamble_rows_num is not None and len(data)!=self.preamble_rows_num:
            raise ValueError("supplied rows number {0} disagrees with extracted from preamble {1}".format(len(data),self.preamble_rows_num))
        if self.out_type=="pandas":
            data=data.astype(data.dtype.type,copy=False)  # convert to native byteorder (required for pandas indexing)
        data=parse_csv.columns_to_table([data[:,i] for i in range(data.shape[1])],columns=self.columns,out_type=self.out_type)
        return datafile.DataFile(data=data,filetype="bin")
def build_file_format(location_file, file_format="generic", **kwargs):
    """
    Create file format (:class:`IInputFileFormat` instance) for given parameters and file locations.

    If ``file_format`` is already an instance of :class:`IInputFileFormat`, return unchanged.
    If ``file_format`` is generic (e.g., ``"generic"`` or ``"text"``), attempt to autodetect it from the file.
    ``**kwargs`` are passed to the file format constructor.
    """
    # already-built formats pass through untouched
    if isinstance(file_format,IInputFileFormat):
        return file_format
    # autodetected formats
    if file_format in {"generic",None}:
        return IInputFileFormat.detect_file_format(location_file)(**kwargs)
    if file_format=="text":
        return ITextInputFileFormat.detect_file_format(location_file)(**kwargs)
    # explicitly named formats
    if file_format=="csv":
        return CSVTableInputFileFormat(**kwargs)
    if file_format=="dict":
        return DictionaryInputFileFormat(**kwargs)
    if file_format=="bin":
        return BinaryTableInputFileFormatter(**kwargs)
    raise ValueError("unrecognized file format: {}".format(file_format))
def load_csv(path=None, out_type="default", dtype="numeric", columns=None, delimiters=None, empty_entry_substitute=None, ignore_corrupted_lines=True, skip_lines=0, loc="file", encoding=None, return_file=False):
    """
    Load data table from a CSV/table file.

    Args:
        path (str): path to the file of a file-like object
        out_type (str): type of the result: ``'array'`` for numpy array, ``'pandas'`` for pandas DataFrame,
            or ``'default'`` (determined by the library default; ``'pandas'`` by default)
        dtype: dtype of entries; can be either a single type, or a list of types (one per column).
            Possible dtypes are: ``'int'``, ``'float'``, ``'complex'``, ``'numeric'`` (tries to coerce to minimal possible numeric type,
            raises error if data can't be converted to `complex`), ``'generic'`` (accept arbitrary types, including lists, dictionaries,
            escaped strings, etc.), ``'raw'`` (keep raw string).
        columns: either a number of columns, or a list of columns names
        delimiters (str): regex string which recognizes entries delimiters (by default ``r"\\s*,\\s*|\\s+"``, i.e., commas and whitespaces)
        empty_entry_substitute: substitute for empty table entries. If ``None``, all empty table entries are skipped
        ignore_corrupted_lines (bool): if ``True``, skip corrupted (e.g., non-numeric for numeric dtype, or with too few entries) lines;
            otherwise, raise :exc:`ValueError`
        skip_lines (int): number of lines to skip from the beginning of the file
        loc (str): location type (``"file"`` means the usual file location; see :func:`.location.get_location` for details)
        encoding: if a new file location is opened, this specifies the encoding
        return_file (bool): if ``True``, return :class:`.DataFile` object (contains some metainfo); otherwise, return just the file data
    """
    lfile=location.LocationFile(location.get_location(path,loc,encoding=encoding))
    fmt=CSVTableInputFileFormat(out_type=out_type,dtype=dtype,columns=columns,delimiters=delimiters,
        empty_entry_substitute=empty_entry_substitute,ignore_corrupted_lines=ignore_corrupted_lines,skip_lines=skip_lines)
    result=fmt.read(lfile)
    if return_file:
        return result
    return result.data
def load_csv_desc(path=None, loc="file", encoding=None, return_file=False):
    """
    Load data from the extended CSV table file.

    Analogous to :func:`load_dict`, but doesn't allow any additional parameters (which don't matter in this case).

    Args:
        path (str): path to the file of a file-like object
        loc (str): location type (``"file"`` means the usual file location; see :func:`.location.get_location` for details)
        encoding: if a new file location is opened, this specifies the encoding
        return_file (bool): if ``True``, return :class:`.DataFile` object (contains some metainfo); otherwise, return just the file data
    """
    # extended CSV files are dictionary files under the hood, so delegate directly
    return load_dict(path,loc=loc,encoding=encoding,return_file=return_file)
def load_bin(path=None, out_type="default", dtype="<f8", columns=None, packing="flatten", preamble=None, skip_bytes=0, loc="file", encoding=None, return_file=False):
    """
    Load data from the binary file.

    Args:
        path (str): path to the file of a file-like object
        out_type (str): type of the result: ``'array'`` for numpy array, ``'pandas'`` for pandas DataFrame,
            or ``'default'`` (determined by the library default; ``'pandas'`` by default)
        dtype: :class:`numpy.dtype` describing the data.
        columns: either a number of columns, or a list of columns names.
        packing (str): The way the 2D array is packed. Can be either ``'flatten'`` (data is stored row-wise)
            or ``'transposed'`` (data is stored column-wise).
        preamble (dict): If not ``None``, defines binary file parameters that supersede the parameters supplied to the function.
            The defined parameters are ``'dtype'``, ``'packing'``, ``'ncols'`` (number of columns) and ``'nrows'`` (number of rows).
        skip_bytes (int): Number of bytes to skip from the beginning of the file.
        loc (str): location type (``"file"`` means the usual file location; see :func:`.location.get_location` for details)
        encoding: if a new file location is opened, this specifies the encoding
        return_file (bool): if ``True``, return :class:`.DataFile` object (contains some metainfo); otherwise, return just the file data
    """
    lfile=location.LocationFile(location.get_location(path,loc,encoding=encoding))
    fmt=BinaryTableInputFileFormatter(out_type=out_type,dtype=dtype,columns=columns,packing=packing,
        preamble=preamble,skip_bytes=skip_bytes)
    result=fmt.read(lfile)
    if return_file:
        return result
    return result.data
def load_bin_desc(path=None, loc="file", encoding=None, return_file=False):
    """
    Load data from the binary file with a description.

    Analogous to :func:`load_dict`, but doesn't allow any additional parameters (which don't matter in this case).

    Args:
        path (str): path to the file of a file-like object
        loc (str): location type (``"file"`` means the usual file location; see :func:`.location.get_location` for details)
        encoding: if a new file location is opened, this specifies the encoding
        return_file (bool): if ``True``, return :class:`.DataFile` object (contains some metainfo); otherwise, return just the file data
    """
    # described binary files are dictionary files under the hood, so delegate directly
    return load_dict(path,loc=loc,encoding=encoding,return_file=return_file)
def load_dict(path=None, case_normalization=None, inline_dtype="generic", entry_format="value", inline_out_type="default", skip_lines=0, allow_duplicate_keys=False, loc="file", encoding=None, return_file=False):
    """
    Load data from the dictionary file.

    Args:
        path (str): path to the file of a file-like object
        case_normalization (str): If ``None``, the dictionary paths are case-sensitive;
            otherwise, defines the way the entries are normalized (``'lower'`` or ``'upper'``).
        inline_dtype (str): dtype for inlined tables.
        inline_out_type (str): type of the result of the inline table: ``'array'`` for numpy array, ``'pandas'`` for pandas DataFrame,
            ``'raw'`` for raw :class:`.InlineTable` data containing tuple ``(column_data, column_names)``,
            or ``'default'`` (determined by the library default; ``'pandas'`` by default).
        entry_format (str): Determines the way for dealing with :class:`.dict_entry.IDictionaryEntry` objects
            (objects transformed into dictionary branches with special recognition rules). Can be
            ``'branch'`` (don't attempt to recognize those objects, leave dictionary as in the file),
            ``'dict_entry'`` (recognize and leave as :class:`.dict_entry.IDictionaryEntry` objects) or
            ``'value'`` (recognize and keep the value).
        allow_duplicate_keys (bool): if ``False`` and the same key is mentioned twice in the file, raise an error
        skip_lines (int): Number of lines to skip from the beginning of the file.
        loc (str): location type (``"file"`` means the usual file location; see :func:`.location.get_location` for details)
        encoding: if a new file location is opened, this specifies the encoding
        return_file (bool): if ``True``, return :class:`.DataFile` object (contains some metainfo); otherwise, return just the file data
    """
    lfile=location.LocationFile(location.get_location(path,loc,encoding=encoding))
    fmt=DictionaryInputFileFormat(case_normalization=case_normalization,
        inline_dtype=inline_dtype,inline_out_type=inline_out_type,
        entry_format=entry_format,skip_lines=skip_lines,allow_duplicate_keys=allow_duplicate_keys)
    result=fmt.read(lfile)
    if return_file:
        return result
    return result.data
def load_generic(path=None, file_format=None, loc="file", encoding=None, return_file=False, **kwargs):
    """
    Load data from the file.

    Args:
        path (str): path to the file of a file-like object
        file_format (str): input file format; if ``None``, attempt to auto-detect file format (same as ``'generic'``);
            can also be an :class:`IInputFileFormat` instance for specific reading method
        loc (str): location type (``"file"`` means the usual file location; see :func:`.location.get_location` for details)
        encoding: if a new file location is opened, this specifies the encoding
        return_file (bool): if ``True``, return :class:`.DataFile` object (contains some metainfo); otherwise, return just the file data

    `**kwargs` are passed to the file formatter used to read the data
    (see :class:`CSVTableInputFileFormat`, :class:`DictionaryInputFileFormat` and :class:`BinaryTableInputFileFormatter`
    for the possible arguments). The default format names are:

        - ``'generic'``: Generic file format. Attempt to autodetect, raise :exc:`IOError` if unsuccessful;
        - ``'text'``: Generic text file. Attempt to autodetect, raise :exc:`IOError` if unsuccessful
        - ``'csv'``: CSV file, corresponds to :class:`CSVTableInputFileFormat`;
        - ``'dict'``: Dictionary file, corresponds to :class:`DictionaryInputFileFormat`;
        - ``'bin'``: Binary file, corresponds to :class:`BinaryTableInputFileFormatter`
    """
    lfile=location.LocationFile(location.get_location(path,loc,encoding=encoding))
    fmt=build_file_format(lfile,file_format=file_format,**kwargs)
    result=fmt.read(lfile)
    if return_file:
        return result
    return result.data