Source code for pylablib.core.fileio.loadfile

"""
Utilities for reading data files.
"""

from . import datafile, location, dict_entry, parse_csv, loadfile_utils
from ..utils import funcargparse, library_parameters

import numpy as np

# Register library-wide default output types for the loaders; overwrite=False keeps
# any value the user has already configured before this module was imported.
library_parameters.library_parameters.update({"fileio/loadfile/csv/out_type":"pandas"},overwrite=False)
library_parameters.library_parameters.update({"fileio/loadfile/dict/inline_out_type":"pandas"},overwrite=False)


##### File formats #####


class IInputFileFormat:
    """
    Generic class for an input file format.

    Based on `file_format` or autodetection, calls one of its subclasses to read the file.

    Defines a single static method
    """
    @staticmethod
    def detect_file_format(location_file):
        # binary vs. text detection; text files get further refined by ITextInputFileFormat
        with location_file.open("rb") as stream:
            if loadfile_utils.detect_binary_file(stream):
                return BinaryTableInputFileFormatter
            return ITextInputFileFormat.detect_file_format(location_file)
    def read(self, location_file):
        """Read a file at a given location"""
        raise NotImplementedError("{}.read".format(type(self).__name__))
class ITextInputFileFormat(IInputFileFormat):  # pylint: disable=abstract-method
    """
    Generic class for a text input file format.

    Based on `file_format` or autodetection, calls one of its subclasses to read the file.
    """
    @staticmethod
    def detect_file_format(location_file):
        with location_file.open("r") as stream:
            txt_type=loadfile_utils.detect_textfile_type(stream)
        # map detected text kind onto the corresponding reader class
        if txt_type=="table":
            return CSVTableInputFileFormat
        if txt_type=="dict":
            return DictionaryInputFileFormat
        raise IOError("can't detect file type")
class CSVTableInputFileFormat(ITextInputFileFormat):
    """
    Class for CSV input file format.

    Args:
        out_type (str): type of the result: ``'array'`` for numpy array, ``'pandas'`` for pandas DataFrame,
            or ``'default'`` (determined by the library default; ``'pandas'`` by default)
        dtype: dtype of entries; can be either a single type, or a list of types (one per column).
            Possible dtypes are: ``'int'``, ``'float'``, ``'complex'``, ``'numeric'`` (tries to coerce to minimal possible numeric type,
            raises error if data can't be converted to `complex`), ``'generic'`` (accept arbitrary types, including lists, dictionaries,
            escaped strings, etc.), ``'raw'`` (keep raw string).
        columns: either a number of columns, or a list of columns names.
        delimiters (str): Regex string which recognizes entries delimiters (by default ``r"\\s*,\\s*|\\s+"``, i.e., commas and whitespaces).
        empty_entry_substitute: Substitute for empty table entries. If ``None``, all empty table entries are skipped.
        ignore_corrupted_lines (bool): If ``True``, skip corrupted (e.g., non-numeric for numeric dtype, or with too few entries) lines;
            otherwise, raise :exc:`ValueError`.
        skip_lines (int): Number of lines to skip from the beginning of the file.
    """
    def __init__(self, out_type="default", dtype="numeric", columns=None, delimiters=None, empty_entry_substitute=None, ignore_corrupted_lines=True, skip_lines=0):
        ITextInputFileFormat.__init__(self)
        # "default" defers to the library-wide setting registered at import time
        if out_type=="default":
            out_type=library_parameters.library_parameters["fileio/loadfile/csv/out_type"]
        self.out_type=out_type
        self.dtype=dtype
        self.columns=columns
        self.delimiters=delimiters or parse_csv._table_delimiters
        self.empty_entry_substitute=empty_entry_substitute
        self.ignore_corrupted_lines=ignore_corrupted_lines
        self.skip_lines=skip_lines
    def read(self, location_file):
        """Read a CSV file at the given location and return a :class:`.DataFile` with the parsed table"""
        with location_file.open("r") as stream:
            for _ in range(self.skip_lines):
                stream.readline()
            data,comments,corrupted=parse_csv.read_table(stream,dtype=self.dtype,columns=self.columns,out_type=self.out_type,
                delimiters=self.delimiters,empty_entry_substitute=self.empty_entry_substitute,ignore_corrupted_lines=self.ignore_corrupted_lines)
        # for pandas output with no explicit column names, try to recover column names from comment/corrupted lines
        autodetect_columns=not funcargparse.is_sequence(self.columns,"builtin;nostring")
        if self.out_type=="pandas" and autodetect_columns and len(data)>0:
            detected_columns,comment_idx=loadfile_utils.find_columns_lines(corrupted,comments,data.shape[1])
            if comment_idx is not None:
                del comments[comment_idx]  # the columns line is metadata, not a regular comment
            if detected_columns is not None:
                data.columns=detected_columns
        creation_time=loadfile_utils.find_savetime_comment(comments)
        return datafile.DataFile(data=data,comments=comments,creation_time=creation_time,filetype="csv")
class DictionaryInputFileFormat(ITextInputFileFormat):
    """
    Class for Dictionary input file format.

    Args:
        case_normalization (str): If ``None``, the dictionary paths are case-sensitive;
            otherwise, defines the way the entries are normalized (``'lower'`` or ``'upper'``).
        inline_dtype (str): dtype for inlined tables.
        inline_out_type (str): type of the result of the inline table: ``'array'`` for numpy array, ``'pandas'`` for pandas DataFrame,
            ``'raw'`` for raw :class:`.InlineTable` data containing tuple ``(column_data, column_names)``,
            or ``'default'`` (determined by the library default; ``'pandas'`` by default).
        entry_format (str): Determines the way for dealing with :class:`.dict_entry.IDictionaryEntry` objects
            (objects transformed into dictionary branches with special recognition rules). Can be
            ``'branch'`` (don't attempt to recognize those objects, leave dictionary as in the file),
            ``'dict_entry'`` (recognize and leave as :class:`.dict_entry.IDictionaryEntry` objects) or
            ``'value'`` (recognize and keep the value).
        allow_duplicate_keys (bool): if ``False`` and the same key is mentioned twice in the file, raise an error
        skip_lines (int): Number of lines to skip from the beginning of the file.
    """
    def __init__(self, case_normalization=None, inline_dtype="generic", inline_out_type="default", entry_format="value", allow_duplicate_keys=False, skip_lines=0):
        ITextInputFileFormat.__init__(self)
        self.case_normalization=case_normalization
        self.inline_dtype=inline_dtype
        # "default" defers to the library-wide setting registered at import time
        self.inline_out_type=library_parameters.library_parameters["fileio/loadfile/dict/inline_out_type"] if inline_out_type=="default" else inline_out_type
        # validate eagerly so a typo fails at construction, not mid-read
        if entry_format not in {"branch","dict_entry","value"}:
            raise ValueError("unrecognized entry format: {0}".format(entry_format))
        self.entry_format=entry_format
        self.allow_duplicate_keys=allow_duplicate_keys
        self.skip_lines=skip_lines
    def read(self, location_file):
        """Read a dictionary file at the given location and return a :class:`.DataFile` with the parsed dictionary"""
        with location_file.open("r") as stream:
            for _ in range(self.skip_lines):
                stream.readline()
            data,comments=loadfile_utils.read_dict_and_comments(stream,inline_dtype=self.inline_dtype,
                case_normalization=self.case_normalization,allow_duplicate_keys=self.allow_duplicate_keys)
        creation_time=loadfile_utils.find_savetime_comment(comments)
        def map_entries(ptr):
            # convert specially-formatted branches into dict_entry objects (or their values)
            if dict_entry.is_dict_entry_branch(ptr):
                entry=dict_entry.from_dict(ptr,location_file.loc)
                if self.entry_format=="value":
                    entry=entry.data
                return entry
            else:
                return ptr
        if self.entry_format!="branch":
            data.map_self(map_entries,to_visit="branches",topdown=False)
        def map_inline_tables(ptr):
            if not dict_entry.is_dict_entry_branch(ptr):  # check if there is an inline table not in the entry
                for k,v in ptr.items():
                    if isinstance(v,loadfile_utils.InlineTable):
                        ptr[k],_=dict_entry.parse_stored_table_data(data=v,out_type=self.inline_out_type)
            return ptr
        if self.inline_out_type!="raw":
            data.map_self(map_inline_tables,to_visit="branches",topdown=False)
        if len(data)==1 and list(data.keys())==["__data__"]:  # special case of files with preamble
            data=data["__data__"]
        return datafile.DataFile(data=data,comments=comments,creation_time=creation_time,filetype="dict")
class BinaryTableInputFileFormatter(IInputFileFormat):
    """
    Class for binary input file format.

    Args:
        out_type (str): type of the result: ``'array'`` for numpy array, ``'pandas'`` for pandas DataFrame,
            or ``'default'`` (determined by the library default; ``'pandas'`` by default)
        dtype: :class:`numpy.dtype` describing the data.
        columns: either a number of columns, or a list of columns names.
        packing (str): The way the 2D array is packed. Can be either ``'flatten'`` (data is stored row-wise)
            or ``'transposed'`` (data is stored column-wise).
        preamble (dict): If not ``None``, defines binary file parameters that supersede the parameters supplied to the function.
            The defined parameters are ``'dtype'``, ``'packing'``, ``'ncols'`` (number of columns) and ``'nrows'`` (number of rows).
        skip_bytes (int): Number of bytes to skip from the beginning of the file.
    """
    def __init__(self, out_type="default", dtype="<f8", columns=None, packing="flatten", preamble=None, skip_bytes=0):
        IInputFileFormat.__init__(self)
        self.out_type=library_parameters.library_parameters["fileio/loadfile/csv/out_type"] if out_type=="default" else out_type
        self.preamble=preamble or {}
        self.dtype=self.preamble.get("dtype",dtype)
        # `columns` may be a list of names (len() works) or a plain column count (len() raises TypeError)
        try:
            self.columns_num=len(columns)
            self.columns=columns
        except TypeError:
            self.columns_num=columns
            self.columns=None
        self.packing=self.preamble.get("packing",packing)
        self.skip_bytes=skip_bytes
        self.preamble_columns_num=self.preamble.get("ncols",None)
        if self.columns_num is None:
            self.columns_num=self.preamble_columns_num
        elif self.preamble_columns_num is not None and self.preamble_columns_num!=self.columns_num:
            raise ValueError("supplied columns number {0} disagrees with extracted from preamble {1}".format(self.columns_num,self.preamble_columns_num))
        self.preamble_rows_num=self.preamble.get("nrows",None)
    def read(self, location_file):
        """Read a binary file at the given location and return a :class:`.DataFile` with the parsed table"""
        with location_file.open("rb") as stream:
            if self.skip_bytes:
                stream.seek(self.skip_bytes,1)
            data=np.fromfile(stream,dtype=self.dtype)
        if self.columns_num is not None:
            if self.packing=="flatten":
                data=data.reshape((-1,self.columns_num))
            elif self.packing=="transposed":
                # fixed: ndarray has no `.transposed()` method (was an AttributeError); use `.transpose()`
                data=data.reshape((self.columns_num,-1)).transpose()
            else:
                raise ValueError("unrecognized packing method: {0}".format(self.packing))
        else:
            data=data[None,:]  # no column info: treat the whole file as a single row
        if self.preamble_rows_num is not None and len(data)!=self.preamble_rows_num:
            raise ValueError("supplied rows number {0} disagrees with extracted from preamble {1}".format(len(data),self.preamble_rows_num))
        if self.out_type=="pandas":
            data=data.astype(data.dtype.type,copy=False)  # convert to native byteorder (required for pandas indexing)
        data=parse_csv.columns_to_table([data[:,i] for i in range(data.shape[1])],columns=self.columns,out_type=self.out_type)
        return datafile.DataFile(data=data,filetype="bin")
def build_file_format(location_file, file_format="generic", **kwargs):
    """
    Create file format (:class:`IInputFileFormat` instance) for given parameters and file locations.

    If ``file_format`` is already an instance of :class:`IInputFileFormat`, return unchanged.
    If ``file_format`` is generic (e.g., ``"generic"`` or ``"text"``), attempt to autodetect it from the file.
    ``**kwargs`` are passed to the file format constructor.
    """
    # already-built formats pass through untouched
    if isinstance(file_format,IInputFileFormat):
        return file_format
    # autodetected formats
    if file_format in {"generic",None}:
        return IInputFileFormat.detect_file_format(location_file)(**kwargs)
    if file_format=="text":
        return ITextInputFileFormat.detect_file_format(location_file)(**kwargs)
    # explicitly named formats
    if file_format=="csv":
        return CSVTableInputFileFormat(**kwargs)
    if file_format=="dict":
        return DictionaryInputFileFormat(**kwargs)
    if file_format=="bin":
        return BinaryTableInputFileFormatter(**kwargs)
    raise ValueError("unrecognized file format: {}".format(file_format))
def load_csv(path=None, out_type="default", dtype="numeric", columns=None, delimiters=None, empty_entry_substitute=None, ignore_corrupted_lines=True, skip_lines=0, loc="file", encoding=None, return_file=False):
    """
    Load data table from a CSV/table file.

    Args:
        path (str): path to the file of a file-like object
        out_type (str): type of the result: ``'array'`` for numpy array, ``'pandas'`` for pandas DataFrame,
            or ``'default'`` (determined by the library default; ``'pandas'`` by default)
        dtype: dtype of entries; can be either a single type, or a list of types (one per column).
            Possible dtypes are: ``'int'``, ``'float'``, ``'complex'``, ``'numeric'`` (tries to coerce to minimal possible numeric type,
            raises error if data can't be converted to `complex`), ``'generic'`` (accept arbitrary types, including lists, dictionaries,
            escaped strings, etc.), ``'raw'`` (keep raw string).
        columns: either a number of columns, or a list of columns names
        delimiters (str): regex string which recognizes entries delimiters (by default ``r"\\s*,\\s*|\\s+"``, i.e., commas and whitespaces)
        empty_entry_substitute: substitute for empty table entries. If ``None``, all empty table entries are skipped
        ignore_corrupted_lines (bool): if ``True``, skip corrupted (e.g., non-numeric for numeric dtype, or with too few entries) lines;
            otherwise, raise :exc:`ValueError`
        skip_lines (int): number of lines to skip from the beginning of the file
        loc (str): location type (``"file"`` means the usual file location; see :func:`.location.get_location` for details)
        encoding: if a new file location is opened, this specifies the encoding
        return_file (bool): if ``True``, return :class:`.DataFile` object (contains some metainfo); otherwise, return just the file data
    """
    lfile=location.LocationFile(location.get_location(path,loc,encoding=encoding))
    fmt=CSVTableInputFileFormat(out_type=out_type,dtype=dtype,columns=columns,delimiters=delimiters,
        empty_entry_substitute=empty_entry_substitute,ignore_corrupted_lines=ignore_corrupted_lines,skip_lines=skip_lines)
    result=fmt.read(lfile)
    if return_file:
        return result
    return result.data
def load_csv_desc(path=None, loc="file", encoding=None, return_file=False):
    """
    Load data from the extended CSV table file.

    Analogous to :func:`load_dict`, but doesn't allow any additional parameters (which don't matter in this case).

    Args:
        path (str): path to the file of a file-like object
        loc (str): location type (``"file"`` means the usual file location; see :func:`.location.get_location` for details)
        encoding: if a new file location is opened, this specifies the encoding
        return_file (bool): if ``True``, return :class:`.DataFile` object (contains some metainfo); otherwise, return just the file data
    """
    # extended CSV files are dictionary files under the hood, so delegate directly
    return load_dict(path,loc=loc,encoding=encoding,return_file=return_file)
def load_bin(path=None, out_type="default", dtype="<f8", columns=None, packing="flatten", preamble=None, skip_bytes=0, loc="file", encoding=None, return_file=False):
    """
    Load data from the binary file.

    Args:
        path (str): path to the file of a file-like object
        out_type (str): type of the result: ``'array'`` for numpy array, ``'pandas'`` for pandas DataFrame,
            or ``'default'`` (determined by the library default; ``'pandas'`` by default)
        dtype: :class:`numpy.dtype` describing the data.
        columns: either a number of columns, or a list of columns names.
        packing (str): The way the 2D array is packed. Can be either ``'flatten'`` (data is stored row-wise)
            or ``'transposed'`` (data is stored column-wise).
        preamble (dict): If not ``None``, defines binary file parameters that supersede the parameters supplied to the function.
            The defined parameters are ``'dtype'``, ``'packing'``, ``'ncols'`` (number of columns) and ``'nrows'`` (number of rows).
        skip_bytes (int): Number of bytes to skip from the beginning of the file.
        loc (str): location type (``"file"`` means the usual file location; see :func:`.location.get_location` for details)
        encoding: if a new file location is opened, this specifies the encoding
        return_file (bool): if ``True``, return :class:`.DataFile` object (contains some metainfo); otherwise, return just the file data
    """
    lfile=location.LocationFile(location.get_location(path,loc,encoding=encoding))
    fmt=BinaryTableInputFileFormatter(out_type=out_type,dtype=dtype,columns=columns,packing=packing,
        preamble=preamble,skip_bytes=skip_bytes)
    result=fmt.read(lfile)
    if return_file:
        return result
    return result.data
def load_bin_desc(path=None, loc="file", encoding=None, return_file=False):
    """
    Load data from the binary file with a description.

    Analogous to :func:`load_dict`, but doesn't allow any additional parameters (which don't matter in this case).

    Args:
        path (str): path to the file of a file-like object
        loc (str): location type (``"file"`` means the usual file location; see :func:`.location.get_location` for details)
        encoding: if a new file location is opened, this specifies the encoding
        return_file (bool): if ``True``, return :class:`.DataFile` object (contains some metainfo); otherwise, return just the file data
    """
    # described binary files are dictionary files under the hood, so delegate directly
    return load_dict(path,loc=loc,encoding=encoding,return_file=return_file)
def load_dict(path=None, case_normalization=None, inline_dtype="generic", entry_format="value", inline_out_type="default", skip_lines=0, allow_duplicate_keys=False, loc="file", encoding=None, return_file=False):
    """
    Load data from the dictionary file.

    Args:
        path (str): path to the file of a file-like object
        case_normalization (str): If ``None``, the dictionary paths are case-sensitive;
            otherwise, defines the way the entries are normalized (``'lower'`` or ``'upper'``).
        inline_dtype (str): dtype for inlined tables.
        inline_out_type (str): type of the result of the inline table: ``'array'`` for numpy array, ``'pandas'`` for pandas DataFrame,
            ``'raw'`` for raw :class:`.InlineTable` data containing tuple ``(column_data, column_names)``,
            or ``'default'`` (determined by the library default; ``'pandas'`` by default).
        entry_format (str): Determines the way for dealing with :class:`.dict_entry.IDictionaryEntry` objects
            (objects transformed into dictionary branches with special recognition rules). Can be
            ``'branch'`` (don't attempt to recognize those objects, leave dictionary as in the file),
            ``'dict_entry'`` (recognize and leave as :class:`.dict_entry.IDictionaryEntry` objects) or
            ``'value'`` (recognize and keep the value).
        allow_duplicate_keys (bool): if ``False`` and the same key is mentioned twice in the file, raise an error
        skip_lines (int): Number of lines to skip from the beginning of the file.
        loc (str): location type (``"file"`` means the usual file location; see :func:`.location.get_location` for details)
        encoding: if a new file location is opened, this specifies the encoding
        return_file (bool): if ``True``, return :class:`.DataFile` object (contains some metainfo); otherwise, return just the file data
    """
    lfile=location.LocationFile(location.get_location(path,loc,encoding=encoding))
    fmt=DictionaryInputFileFormat(case_normalization=case_normalization,
        inline_dtype=inline_dtype,inline_out_type=inline_out_type,
        entry_format=entry_format,skip_lines=skip_lines,allow_duplicate_keys=allow_duplicate_keys)
    result=fmt.read(lfile)
    if return_file:
        return result
    return result.data
def load_generic(path=None, file_format=None, loc="file", encoding=None, return_file=False, **kwargs):
    """
    Load data from the file.

    Args:
        path (str): path to the file of a file-like object
        file_format (str): input file format; if ``None``, attempt to auto-detect file format (same as ``'generic'``);
            can also be an :class:`IInputFileFormat` instance for specific reading method
        loc (str): location type (``"file"`` means the usual file location; see :func:`.location.get_location` for details)
        encoding: if a new file location is opened, this specifies the encoding
        return_file (bool): if ``True``, return :class:`.DataFile` object (contains some metainfo); otherwise, return just the file data

    `**kwargs` are passed to the file formatter used to read the data
    (see :class:`CSVTableInputFileFormat`, :class:`DictionaryInputFileFormat` and :class:`BinaryTableInputFileFormatter`
    for the possible arguments). The default format names are:

        - ``'generic'``: Generic file format. Attempt to autodetect, raise :exc:`IOError` if unsuccessful;
        - ``'text'``: Generic text file. Attempt to autodetect, raise :exc:`IOError` if unsuccessful
        - ``'csv'``: CSV file, corresponds to :class:`CSVTableInputFileFormat`;
        - ``'dict'``: Dictionary file, corresponds to :class:`DictionaryInputFileFormat`;
        - ``'bin'``: Binary file, corresponds to :class:`BinaryTableInputFileFormatter`
    """
    lfile=location.LocationFile(location.get_location(path,loc,encoding=encoding))
    fmt=build_file_format(lfile,file_format=file_format,**kwargs)
    result=fmt.read(lfile)
    if return_file:
        return result
    return result.data