"""Utilities for saving and loading dictionaries, tabular text data, and sparse matrices.
This module provides lightweight I/O helpers used by examples and scripts:
- store Python dictionaries with ``pickle`` or NumPy ``.npz`` archives,
- append data series to a simple comma-separated text file format,
- read tabular text files back into NumPy arrays,
- export sparse matrices to a human-readable ``.dat`` file.
"""
import os
import pickle
import numpy as np
from .checks import validate_parameters
# Public API of this module: the names exported by a star-import.
__all__ = [
    "save_dictionary",
    "load_dictionary",
    "save_data_in_textfile",
    "load_data_from_textfile",
    "save_sparse_matrix_to_dat",
]
def _normalize_dictionary_format(filename, file_format):
    """Map a requested (or inferred) archive format onto its canonical label.

    Parameters
    ----------
    filename : str
        File name whose extension is inspected only when ``file_format`` is
        ``None``.
    file_format : str or None
        Requested format. ``"pickle"`` and ``"pkl"`` both select pickle,
        ``"npz"`` selects the NumPy archive. Matching is case-insensitive.

    Returns
    -------
    str
        Either ``"pickle"`` or ``"npz"``.

    Raises
    ------
    TypeError
        If ``file_format`` is neither ``None`` nor a string.
    ValueError
        If ``file_format`` is a string that names no supported format.
    """
    if file_format is None:
        # No explicit request: only a ".npz" extension (any letter case)
        # selects the NumPy archive; everything else falls back to pickle.
        _, suffix = os.path.splitext(filename)
        return "npz" if suffix.lower() == ".npz" else "pickle"

    if not isinstance(file_format, str):
        raise TypeError(f"file_format should be STRING or None, not {type(file_format)}")

    canonical = {"pickle": "pickle", "pkl": "pickle", "npz": "npz"}.get(
        file_format.lower()
    )
    if canonical is None:
        raise ValueError(
            f"Unsupported file_format '{file_format}'. Supported formats are 'pickle' and 'npz'."
        )
    return canonical
def _resolve_dictionary_filename(filename, file_format):
    """Return the path actually used on disk for the given archive format.

    For the ``"npz"`` format a ``.npz`` suffix is appended unless ``filename``
    already ends with it (the check is case-sensitive). For any other format
    ``filename`` is returned unchanged.
    """
    if file_format != "npz" or filename.endswith(".npz"):
        return filename
    return f"{filename}.npz"
[docs]
def save_dictionary(dictionary, filename, file_format=None, compressed=True):
    """Write a dictionary to disk as a pickle file or a NumPy ``.npz`` archive.

    Parameters
    ----------
    dictionary : dict
        Mapping to store.
    filename : str
        Destination path. When ``file_format`` is not given, a ``.npz``
        extension selects the NumPy archive and any other extension selects
        pickle.
    file_format : str or None, optional
        Explicit format: ``"pickle"``/``"pkl"`` or ``"npz"``. ``None``
        (default) infers the format from ``filename``.
    compressed : bool, optional
        For ``.npz`` output, choose ``np.savez_compressed`` (``True``,
        default) or ``np.savez`` (``False``). Has no effect on pickle output.

    Returns
    -------
    str
        Path of the file that was written. For ``.npz`` output the suffix is
        appended when ``filename`` does not already end with it.

    Raises
    ------
    TypeError
        If ``compressed`` is not a bool, if ``file_format`` is neither a
        string nor ``None``, or if a key is not a string when writing
        ``.npz``. ``dictionary`` and ``filename`` are checked by
        ``validate_parameters``.
    ValueError
        If ``file_format`` is unsupported, or if a value converts to an
        object-dtype array when writing ``.npz``.

    Notes
    -----
    The ``.npz`` route stores each value as ``np.asarray(value)`` and is meant
    for flat dictionaries of array-like values. Use pickle for nested Python
    objects.
    """
    validate_parameters(dictionary=dictionary, filename=filename)
    if not isinstance(compressed, bool):
        raise TypeError(f"compressed should be BOOL, not a {type(compressed)}")

    archive_kind = _normalize_dictionary_format(filename, file_format)
    target_path = _resolve_dictionary_filename(filename, archive_kind)

    if archive_kind == "pickle":
        # Binary write mode replaces any file already present at the target.
        with open(target_path, "wb") as stream:
            pickle.dump(dictionary, stream, pickle.HIGHEST_PROTOCOL)
    else:
        arrays = {}
        for name, item in dictionary.items():
            if not isinstance(name, str):
                raise TypeError(
                    f"Dictionary keys must be STRINGs for npz output, not {type(name)}"
                )
            as_array = np.asarray(item)
            # Object arrays cannot be read back with allow_pickle=False.
            if as_array.dtype == object:
                raise ValueError(
                    "npz output supports only values convertible to non-object NumPy arrays; "
                    "use pickle for nested Python objects."
                )
            arrays[name] = as_array
        writer = np.savez_compressed if compressed else np.savez
        writer(target_path, **arrays)

    return target_path
[docs]
def load_dictionary(filename, file_format=None):
    """Read back a dictionary stored as a pickle file or a ``.npz`` archive.

    Parameters
    ----------
    filename : str
        Path of the stored dictionary. With ``file_format=None`` a ``.npz``
        extension selects the NumPy archive. With ``file_format="npz"`` the
        suffix is appended when missing. As a convenience, an extension-less
        name that does not exist on disk resolves to ``filename + ".npz"``
        when that file exists.
    file_format : str or None, optional
        Explicit format: ``"pickle"``/``"pkl"`` or ``"npz"``. ``None``
        (default) infers the format from ``filename``.

    Returns
    -------
    dict
        The stored dictionary. For ``.npz`` archives, 0-dimensional entries
        are returned as Python scalars and all other entries as independent
        NumPy array copies.

    Raises
    ------
    TypeError
        If ``file_format`` is neither a string nor ``None``. ``filename`` is
        checked by ``validate_parameters``.
    ValueError
        If ``file_format`` is unsupported.

    Notes
    -----
    Pickle files are deserialized with ``pickle.load``, which can execute
    arbitrary code; only load pickle files from trusted sources. ``.npz``
    archives are opened with ``allow_pickle=False``.
    """
    validate_parameters(filename=filename)

    # Convenience fallback: "name" -> "name.npz" when only the archive exists.
    use_npz_fallback = (
        file_format is None
        and os.path.splitext(filename)[1] == ""
        and not os.path.exists(filename)
        and os.path.exists(f"{filename}.npz")
    )
    if use_npz_fallback:
        archive_kind = "npz"
        source_path = f"{filename}.npz"
    else:
        archive_kind = _normalize_dictionary_format(filename, file_format)
        source_path = _resolve_dictionary_filename(filename, archive_kind)

    if archive_kind == "pickle":
        with open(source_path, "rb") as stream:
            return pickle.load(stream)

    restored = {}
    with np.load(source_path, allow_pickle=False) as archive:
        for name in archive.files:
            entry = archive[name]
            if entry.shape == ():
                # 0-d array: hand back the plain Python scalar.
                restored[name] = entry.item()
            else:
                restored[name] = entry.copy()
    return restored
[docs]
def save_data_in_textfile(data_file, x_data, new_data):
    """Append a data series as a new column in a simple text table.

    The file format is line-based and comma-separated. If ``data_file`` does
    not exist yet, the table is seeded with one line per entry of ``x_data``
    and ``new_data`` becomes its second column. If the file already exists,
    ``x_data`` is not used and every existing line gets one extra
    comma-separated entry taken from ``new_data``.

    Parameters
    ----------
    data_file : str
        Path to the text file to create or update.
    x_data : list
        Reference column used only when the file does not yet exist.
        Conventionally the first item is a label and the rest are x values.
        Entries are expected not to contain line breaks.
    new_data : list
        Column to append. It needs at least one entry per table line;
        conventionally the first item is a label. Entries beyond the number
        of table lines are ignored.

    Returns
    -------
    None

    Raises
    ------
    TypeError
        If ``data_file`` is not a string, or ``x_data``/``new_data`` is not a
        list.
    IndexError
        If ``new_data`` has fewer entries than the table has lines. The check
        runs before anything is written, so an existing file is left
        untouched and a missing file is not created.

    Notes
    -----
    Values are written with their default string formatting and are not
    parsed or validated.
    """
    if not isinstance(data_file, str):
        raise TypeError(f"data_file should be a STRING, not a {type(data_file)}")
    if not isinstance(x_data, list):
        raise TypeError(f"x_data must be a LIST, not a {type(x_data)}")
    if not isinstance(new_data, list):
        raise TypeError(f"new_data must be a LIST, not a {type(new_data)}")

    if os.path.exists(data_file):
        with open(data_file, "r", encoding="utf-8") as data_handle:
            lines = data_handle.readlines()
    else:
        # First write: the reference column seeds the table, one entry per line.
        lines = [f"{entry}\n" for entry in x_data]

    # Validate before opening for writing: opening in "w" mode truncates the
    # file, so failing afterwards would destroy the existing table.
    if len(new_data) < len(lines):
        raise IndexError(
            f"new_data has {len(new_data)} entries but the table has {len(lines)} lines"
        )

    updated_lines = [
        f"{line_entry.rstrip()},{new_entry}\n"
        for line_entry, new_entry in zip(lines, new_data)
    ]
    with open(data_file, "w", encoding="utf-8") as data_handle:
        data_handle.writelines(updated_lines)
[docs]
def load_data_from_textfile(data_file_name, row_for_labels=False):
    """Load columnar numeric data from a comma-separated text file.

    Parameters
    ----------
    data_file_name : str
        Path to the input file.
    row_for_labels : bool, optional
        If ``True``, interpret the first non-blank line as column labels and
        store them as ``"label_0"``, ``"label_1"``, ... entries in the
        returned dictionary. Default is ``False``.

    Returns
    -------
    dict
        One NumPy array of floats per column under the string keys ``"0"``,
        ``"1"``, ... . If ``row_for_labels`` is ``True``, the label strings
        are included as well. An empty dictionary is returned when the file
        contains no non-blank lines.

    Raises
    ------
    TypeError
        If ``data_file_name`` is not a string or ``row_for_labels`` is not a
        bool.
    ValueError
        If a data entry cannot be converted to ``float``.
    IndexError
        If a data line has fewer entries than the first non-blank line.

    Notes
    -----
    The number of columns is taken from the first non-blank line. Blank lines
    (for example a trailing empty line) are skipped. Extra entries on a data
    line beyond that column count are ignored.
    """
    if not isinstance(data_file_name, str):
        raise TypeError(
            f"data_file_name should be a STRING, not a {type(data_file_name)}"
        )
    if not isinstance(row_for_labels, bool):
        raise TypeError(f"row_for_labels must be a BOOL, not a {type(row_for_labels)}")

    # Split every non-blank line into its comma-separated entries.
    with open(data_file_name, "r", encoding="utf-8") as data_handle:
        rows = [line.strip().split(",") for line in data_handle if line.strip()]

    if not rows:
        # Nothing to parse: no columns can be inferred from an empty file.
        return {}

    first_row_entries = rows[0]
    # The first row is consumed by the labels when requested.
    data_rows = rows[1:] if row_for_labels else rows

    # Parse row by row so the first malformed entry in file order is reported.
    columns = [[] for _ in first_row_entries]
    for row_entries in data_rows:
        for column_index, column in enumerate(columns):
            column.append(float(row_entries[column_index]))

    data = {}
    for column_index, entry in enumerate(first_row_entries):
        data[str(column_index)] = np.asarray(columns[column_index])
        if row_for_labels:
            data[f"label_{column_index}"] = entry
    return data
[docs]
def save_sparse_matrix_to_dat(sparse_matrix, filename):
    """Dump a sparse matrix to a human-readable ``.dat`` text file.

    The file is laid out as:

    - a ``# dimension`` comment followed by ``sparse_matrix.shape[0]``,
    - a comment line introducing the entries,
    - one line per stored entry of the COO representation, formatted as
      ``row, col; (real, imag)``.

    Parameters
    ----------
    sparse_matrix : scipy.sparse.spmatrix
        Matrix to export; it is converted with ``tocoo()``.
    filename : str
        Output file path. An existing file is overwritten.

    Returns
    -------
    None

    Raises
    ------
    TypeError
        Presumably raised by ``validate_parameters`` for arguments of the
        wrong type (the helper is defined outside this module).

    Notes
    -----
    Only the first dimension of the shape is recorded in the header.
    """
    validate_parameters(op_list=[sparse_matrix], filename=filename)
    with open(filename, "w", encoding="utf-8") as out_file:
        # Header: matrix dimension.
        print("# dimension", file=out_file)
        print(f"{sparse_matrix.shape[0]}", file=out_file)
        # Body: one line per stored (row, column, value) triplet.
        print("# Non-zero elements: coordinates and coefficients", file=out_file)
        triplets = sparse_matrix.tocoo()
        for i, j, coefficient in zip(triplets.row, triplets.col, triplets.data):
            print(
                f"{i}, {j}; ({coefficient.real}, {coefficient.imag})",
                file=out_file,
            )