# dataset.py

import datetime
import sys
import pathlib
import collections
import re
import warnings
import numpy as np

from enum import IntEnum

# Default printf-style number format used by the data stores' write() methods
# (handed to np.savetxt as ``fmt``).
DEFAULT_NUM_FORMAT = "%f"
# Default delimiter placed between fields when writing header and data lines.
DEFAULT_FIELD_DELIM = ", "


class Formats(IntEnum):
    """File Format Indices (FFI) supported by this module.

    The integer value is the format index as it appears in the first header
    line of an ICARTT file.
    """
    # one independent variable plus dependent variables
    FFI1001 = 1001
    # additionally uses an independent bounded variable and auxiliary variables
    FFI2110 = 2110


class VariableType(IntEnum):
    """Role a variable plays within a dataset (stored as ``Variable.vartype``)."""
    IndependentVariable = 1
    IndependentBoundedVariable = 2
    AuxiliaryVariable = 3
    DependentVariable = 4


def vmiss_to_npnan(val, miss):
    """Convert *val* to ``np.nan`` if it is (almost) equal to the missing value.

    Both arguments are cast to ``float`` first, so numeric strings are accepted.

    :param val: value to check
    :type val: float or str

    :param miss: missing value indicator to compare against
    :type miss: float or str

    :return: ``np.nan`` if ``val`` is close to ``miss`` (as judged by
        ``np.isclose``), otherwise ``val`` as a float
    :rtype: float
    """
    val, miss = float(val), float(miss)
    # np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0
    return np.nan if np.isclose(val, miss) else val


class DataStore1001:
    """Data store for one independent variable plus a set of other variables.

    Records are kept in ``self.data`` as a one-dimensional NumPy structured
    array with one ``f8`` field per variable (``None`` as long as nothing has
    been added). Values equal to a variable's missing value are stored as NaN
    and converted back to the missing value when written.

    :param ivar: independent variable; must provide ``shortname`` and ``miss``
    :param dvars: mapping of short name to variable (each providing ``miss``)
    """

    def __init__(self, ivar, dvars):
        self.ivarname = ivar.shortname

        # column order: independent variable first, then dvars in mapping order
        self.varnames = [ivar.shortname] + list(dvars)
        self.missvals = {name: dvars[name].miss for name in dvars}
        self.missvals[self.ivarname] = ivar.miss

        self.data = None

    def __getitem__(self, s=slice(None)):
        """Index directly into the underlying structured array."""
        return self.data[s]

    def addBulk(self, raw):
        """Add all rows of a two-dimensional array.

        :param raw: array of shape (lines, variables)
        :raises Exception: if the column count does not match the variable count
        """
        nlines, nvars = raw.shape
        if nvars != len(self.varnames):
            raise Exception("Number of data columns does not match variable count!")
        self._addBulk(raw, nlines)

    def addBulkFromTxt(self, raw):
        """Add all rows of a list of already split text lines.

        :param raw: list of rows, each a list of field values
        :raises Exception: if the column count of the first row does not match
            the variable count
        """
        if not raw:
            # nothing to add; previously this failed with an IndexError on raw[0]
            return
        if len(raw[0]) != len(self.varnames):
            raise Exception("Number of data columns does not match variable count!")
        self._addBulk(raw, len(raw))

    def _addBulk(self, raw, n):
        # map each column onto its variable name and add row by row
        for cur in range(n):
            newdata = {x: raw[cur][i] for i, x in enumerate(self.varnames)}
            self.add(**newdata)

    def add(self, **kwargs):
        """Add a single record given as ``shortname=value`` keyword arguments.

        Variables not given are stored as NaN; keywords that are not known
        variable names are ignored.

        :raises Exception: if the independent variable is not given, or if a
            record with the same independent variable value already exists
        """
        if self.ivarname not in kwargs:
            raise Exception("Need independent variable data.")

        ivarvalue = vmiss_to_npnan(kwargs[self.ivarname], self.missvals[self.ivarname])

        # np.nan instead of np.NaN: the latter alias was removed in NumPy 2.0
        newline = np.array(np.nan, dtype=[(v, "f8") for v in self.varnames])
        for key, value in kwargs.items():
            if key in self.varnames:
                newline[key] = vmiss_to_npnan(value, self.missvals[key])

        if self.data is None:
            # the new record is 0-dimensional; make it a 1-element array so
            # that further records can be appended
            self.data = newline.reshape(1)
        elif ivarvalue in self.data[self.ivarname]:
            raise Exception("Cannot replace data (yet).")
        else:
            self.data = np.append(self.data, newline)

    def denanify(self, d):
        """Return a copy of *d* with NaNs replaced by the missing values."""
        dd = d.copy()
        for k, miss in self.missvals.items():
            # missing values may be kept as strings, the data fields are floats
            dd[k][np.isnan(dd[k])] = float(miss)
        return dd

    def write(
        self, f=sys.stdout, fmt=DEFAULT_NUM_FORMAT, delimiter=DEFAULT_FIELD_DELIM
    ):
        """Write all records to *f* using ``np.savetxt``.

        :param f: file handle or stream to write to, defaults to sys.stdout
        :param fmt: number format passed to ``np.savetxt``
        :param delimiter: field delimiter passed to ``np.savetxt``
        """
        if self.data is None:
            # no records were added, so there is nothing to write
            return
        d = self.denanify(self.data)
        np.savetxt(f, d, fmt=fmt, delimiter=delimiter)


class DataStore2110(collections.UserDict):
    """Data store with auxiliary and dependent data per independent variable value.

    ``self.data`` maps each value of the independent variable to a dictionary
    with two :class:`DataStore1001` instances: ``"AUX"`` (independent variable
    plus auxiliary variables) and ``"DEP"`` (independent bounded variable plus
    dependent variables).

    :param ivar: independent variable; must provide ``shortname`` and ``miss``
    :param ibvar: independent bounded variable (``shortname`` and ``miss``)
    :param auxvars: mapping of short name to auxiliary variable
    :param dvars: mapping of short name to dependent variable
    """

    def __init__(self, ivar, ibvar, auxvars, dvars):
        # initialise the UserDict base class; this creates the empty self.data
        super().__init__()

        self.ivarname = ivar.shortname
        self.ibvarname = ibvar.shortname

        self.auxvarnames = list(auxvars)
        self.dvarnames = list(dvars)

        self.missvals = {x: dvars[x].miss for x in dvars}
        self.missvals.update({x: auxvars[x].miss for x in auxvars})
        self.missvals[self.ibvarname] = ibvar.miss
        self.missvals[self.ivarname] = ivar.miss

        # by convention, the first auxiliary variable holds the number of
        # dependent data lines that follow an auxiliary line
        self.nauxvarname = self.auxvarnames[0]

        self.ivar = ivar
        self.auxvars = auxvars
        self.ibvar = ibvar
        self.dvars = dvars

    def __getitem__(self, s=slice(None)):
        """Return the AUX/DEP stores for a value of the independent variable."""
        return self.data[s]

    def _addAuxline(self, auxline):
        # columns of an auxiliary line: independent variable, then aux variables
        newdata = {
            x: auxline[i] for i, x in enumerate([self.ivarname] + self.auxvarnames)
        }
        self.add(**newdata)

    def addBulkDep(self, ivar, raw):
        """Add dependent data lines for the independent variable value *ivar*.

        :param ivar: value of the independent variable the lines belong to
        :param raw: two-dimensional array of shape (lines, variables)
        """
        nlines, _ = raw.shape  # only the line count is needed
        self._addDeplines(ivar, raw, nlines)

    def _addDeplines(self, ivar, raw, n):
        # columns of a dependent line: bounded independent variable, then
        # the dependent variables
        for cur in range(n):
            newdata = {
                x: raw[cur][i] for i, x in enumerate([self.ibvarname] + self.dvarnames)
            }
            newdata[self.ivarname] = ivar
            self.add(**newdata)

    def addBulkFromTxt(self, raw):
        """Add all data from a list of already split text lines.

        :param raw: list of rows, each a list of field values
        """
        self._addBulk(raw, len(raw))

    def _addBulk(self, raw, n):
        # the input alternates between one auxiliary line and the block of
        # dependent lines that belongs to it
        cur = 0
        while cur < n:
            ivarvalue = vmiss_to_npnan(raw[cur][0], self.missvals[self.ivarname])

            self._addAuxline(raw[cur])
            cur += 1

            # number of dependent lines announced by the auxiliary line just
            # added; handles both a 0-dimensional value and an array (of which
            # the last entry is used)
            ndepData = self.data[ivarvalue]["AUX"][self.nauxvarname]
            ndepData = int(ndepData) if ndepData.shape == () else int(ndepData[-1])

            self._addDeplines(ivarvalue, raw[cur : (cur + ndepData)], ndepData)

            cur += ndepData

    def add(self, **kwargs):
        """Add one auxiliary or dependent record given as keyword arguments.

        A record containing auxiliary variables is added to the ``"AUX"``
        store (creating the stores for that independent variable value if
        needed); a record containing dependent variables is added to the
        ``"DEP"`` store.

        :raises Exception: if the independent variable is missing, if a
            dependent record lacks the independent bounded variable, or if a
            dependent record is added before its auxiliary record
        """
        # whatever we do, an independent variable is needed
        if self.ivarname not in kwargs:
            raise Exception("Need independent variable data.")

        ivarvalue = vmiss_to_npnan(kwargs[self.ivarname], self.missvals[self.ivarname])

        # this is an AUX line
        if any(x in self.auxvarnames for x in kwargs):
            # and we create the whole dataset if needed
            if ivarvalue not in self.data:
                self.data[ivarvalue] = {
                    "AUX": DataStore1001(self.ivar, self.auxvars),
                    "DEP": DataStore1001(self.ibvar, self.dvars),
                }
            self.data[ivarvalue]["AUX"].add(**kwargs)

        # this is a DEP line
        if any(x in self.dvarnames for x in kwargs):
            if self.ibvarname not in kwargs:
                raise Exception("Need independent (bounded) variable data.")

            if ivarvalue not in self.data:
                raise Exception("Aux data line needs to be added first.")

            self.data[ivarvalue]["DEP"].add(**kwargs)

    def write(
        self, f=sys.stdout, fmt=DEFAULT_NUM_FORMAT, delimiter=DEFAULT_FIELD_DELIM
    ):
        """Write the auxiliary line and dependent lines of every entry to *f*.

        :param f: file handle or stream to write to, defaults to sys.stdout
        :param fmt: number format passed on to the underlying stores
        :param delimiter: field delimiter passed on to the underlying stores
        """
        for stores in self.data.values():
            stores["AUX"].write(f, fmt=fmt, delimiter=delimiter)
            stores["DEP"].write(f, fmt=fmt, delimiter=delimiter)


class KeywordComment:
    """A "KEYWORD: value" entry of the normal comments section.

    :param key: the keyword
    :type key: str

    :param naAllowed: flag stored with the keyword; not evaluated by this class
    :type naAllowed: bool
    """

    def __init__(self, key, naAllowed):
        self.key = key
        self.naAllowed = naAllowed
        self.data = []  # one entry per line of the value

    def append(self, data):
        """Append one more line to the value of this keyword."""
        self.data.append(data)

    def __str__(self):
        # Test for emptiness by truthiness: the former identity test against
        # a fresh list literal ("is []") could never succeed, so "N/A" was
        # never written for keywords without data.
        d = "\n".join(self.data) if self.data else "N/A"
        return self.key + ": " + d


class StandardNormalComments(collections.UserList):
    """Normal comments section: free-form text, keyword entries and short names.

    The list content (``data``) is assembled on access from the free-form
    lines, one string per keyword entry and the short names line.
    """

    # revision entries: "R" followed by one or two alphanumeric characters
    _REVISION_KEY = re.compile(r"R[a-zA-Z0-9]{1,2}[ ]*")

    @property
    def nlines(self):
        """Number of lines: free-form lines, keyword value lines, short names line.

        A keyword without any data contributes zero lines to this count.
        """
        # "+ 1" -> shortnames line, and keywords might be multiline...
        return (
            len(self.freeform) + 1 + sum(len(k.data) for k in self.keywords.values())
        )

    @property
    def data(self):
        """All comments as a list: free-form lines, keyword entries, short names."""
        return (
            self.freeform + [str(s) for s in self.keywords.values()] + [self.shortnames]
        )

    def ingest(self, raw):
        """Sort the raw lines of a normal comments section into their parts.

        The last line is taken as the short names line. Lines before the first
        keyword are free-form text; a line starting with a known keyword or a
        revision key (e.g. "R0") starts a new entry, and all following lines up
        to the next keyword are continuation lines of that entry.

        :param raw: lines of the normal comments section (not modified)
        :type raw: list of str
        """
        lines = list(raw)  # work on a copy, the caller's list stays untouched

        # last line is always shortname
        if lines:
            self.shortnames = lines.pop()
        else:
            warnings.warn("Normal comments: short names line is missing.")

        # per standard: The free-form text section consists of the lines
        # between the beginning of the normal comments section
        # and the first required keyword. [...] The required "KEYWORD: value" pairs block
        # starts with the line that begins with the first required keyword
        # and must include all required "KEYWORD: value" pairs
        # in the order listed in the ICARTT documentation.

        currentKeyword = None
        for line in lines:
            head, sep, tail = line.partition(":")
            possibleKeyword = head.strip()
            # fullmatch: with a plain match(), any text merely *starting* with
            # "R" plus one or two alphanumerics (e.g. "Remember ...") would be
            # mistaken for a revision key
            startsEntry = (
                possibleKeyword in self.keywords
                or self._REVISION_KEY.fullmatch(possibleKeyword) is not None
            )
            if startsEntry:
                currentKeyword = possibleKeyword
                if currentKeyword not in self.keywords:  # for the revisions only...
                    self.keywords[currentKeyword] = KeywordComment(
                        currentKeyword, False
                    )

            if currentKeyword is None:
                self.freeform.append(line)
            elif startsEntry and sep:
                # keyword line: keep only the value after "KEYWORD:"
                self.keywords[currentKeyword].append(tail.strip())
            else:
                # continuation line: keep it completely, even if it contains
                # a colon itself
                self.keywords[currentKeyword].append(line.strip())

        for key, comment in self.keywords.items():
            if not comment.data:
                warnings.warn(
                    f"Normal comments: required keyword {key:s} is missing."
                )

    def __init__(self):
        # NOTE: UserList.__init__ is deliberately not called: it would assign
        # to self.data, which is a read-only property in this class.
        self.freeform = []
        self.shortnames = []

        requiredKeywords = (
            "PI_CONTACT_INFO",
            "PLATFORM",
            "LOCATION",
            "ASSOCIATED_DATA",
            "INSTRUMENT_INFO",
            "DATA_INFO",
            "UNCERTAINTY",
            "ULOD_FLAG",
            "ULOD_VALUE",
            "LLOD_FLAG",
            "LLOD_VALUE",
            "DM_CONTACT_INFO",
            "PROJECT_INFO",
            "STIPULATIONS_ON_USE",
            "OTHER_COMMENTS",
            "REVISION",
        )

        self.keywords = {k: KeywordComment(k, True) for k in requiredKeywords}

        self.keywords["UNCERTAINTY"].naAllowed = False
        self.keywords["REVISION"].naAllowed = False


class Variable:
    """An ICARTT variable description with name, units, scale and missing value.

    :param shortname: Short name of the variable
    :type shortname: str

    :param units: Units of the variable
    :type units: str

    :param standardname: Standard name of the variable
    :type standardname: str

    :param longname: Long name of the variable
    :type longname: str

    :param vartype: Variable type (unbounded/bounded independent, auxiliary or dependent)
    :type vartype: enum:`VariableType`, defaults to VariableType.DependentVariable

    :param scale: Scaling factor for the variable
    :type scale: float, defaults to 1.0

    :param miss: Missing value for the variable
    :type miss: float, defaults to -99999.0
    """

    def desc(self, splitChar=","):
        """Variable description string as it appears in an ICARTT file

        Short name and units are always included; standard name and long name
        are appended only if they are not None.

        :param splitChar: string used to join the fields
        :type splitChar: str, defaults to ","

        :return: description string
        :rtype: str
        """
        descstr = [str(self.shortname), str(self.units)]
        if self.standardname is not None:
            descstr.append(str(self.standardname))
        if self.longname is not None:
            descstr.append(str(self.longname))
        return splitChar.join(descstr)

    def isValidVariablename(self, name):
        """Check a variable name against the ICARTT naming rules

        :param name: name to check
        :type name: str

        :return: True if the name complies, False otherwise (also for an
            empty name)
        :rtype: bool
        """
        # ICARTT Standard v2 2.1.1 2)
        # Variable short names and variable standard names:
        # Uppercase and lowercase ASCII alphanumeric characters
        # and underscores.
        # The first character must be a letter,
        # and the name can be at most 31 characters in length.
        #
        # A single anchored pattern covers all three rules and, unlike
        # indexing name[0], does not fail on an empty string.
        return re.fullmatch(r"[a-zA-Z][a-zA-Z0-9_]{0,30}", name) is not None

    def __init__(
        self,
        shortname,
        units,
        standardname,
        longname,
        vartype=VariableType.DependentVariable,
        scale=1.0,
        miss=-99999.0,
    ):
        """Constructor method

        Emits a warning (but still creates the variable) if the short name
        does not comply with the naming rules.
        """
        if not self.isValidVariablename(shortname):
            warnings.warn(
                f"Variable short name {shortname:s} does not comply with ICARTT standard v2"
            )

        self.shortname = shortname
        self.units = units
        self.standardname = standardname
        self.longname = longname
        self.vartype = vartype
        self.scale = scale
        self.miss = miss

    # TODO: should this also get a __str__ ?

    def __repr__(self):  # TODO: this could be more meaningful?
        return "ICARTT Variable description"


class Dataset:
    """An ICARTT dataset that can be created from scratch or read from a file,
    manipulated, and then written to a file.

    :param f: file path or file handle to use
    :type f: str or file handle or stream object, defaults to None

    :param loadData: load data as well (or only header if False)?
    :type loadData: bool, defaults to "True"

    :param splitChar: splitting character used to separate fields in a line
    :type splitChar: str, defaults to ","

    :param format:
    """

    # TODO: should this also get a __str__  and a __repr__ ?

    @property
    def nHeader(self):
        """Header line count

        Sum of a fixed number of lines for the file format and the number of
        variable descriptions, special comment lines and normal comment lines.

        :return: line count, or -1 if the format is neither FFI1001 nor FFI2110
        :rtype: int
        """
        if self.format == Formats.FFI1001:
            fixedLines = 14
            nAuxiliary = 0
        elif self.format == Formats.FFI2110:
            # 2: IVAR + IBVAR
            fixedLines = 16 + 2
            nAuxiliary = len(self.auxiliaryVariables)
        else:
            return -1

        nDependent = len(self.dependentVariables)
        nSpecial = len(self.specialComments)
        nNormal = self.normalComments.nlines
        return fixedLines + nAuxiliary + nDependent + nSpecial + nNormal

    @property
    def times(self):
        """Time steps of the data

        :return: list of time steps
        :rtype: list
        """
        return [
            # TODO see commit f5208db0 - this will do unexpected things if
            #      self.dateOfCollection is a naive datetime object (think DST transitions...)
            self.dateOfCollection + datetime.timedelta(seconds=x)
            for x in self.independentVariable
        ]

    @property
    def variables(self):
        """Variables (independent + dependent + auxiliary)

        :return: dictionary of all variables
        :rtype: dict of Variable(s)
        """
        vars = {}  # TODO rename - "vars" is a Python built-in
        if not self.independentVariable is None: # TODO use "is not None" for readbility
            vars[self.independentVariable.shortname] = self.independentVariable
        if not self.independentBoundedVariable is None: # TODO use "is not None" for readbility
            vars[
                self.independentBoundedVariable.shortname
            ] = self.independentBoundedVariable

        vars = {**vars, **self.dependentVariables, **self.auxiliaryVariables}

        return vars

    def readHeader(self, splitChar=","):
        """Read the ICARTT header (from file)

        Re-opens the input file if its handle is closed, parses the header
        and closes the handle again. Problems encountered while parsing do not
        raise, but are reported as a warning.

        :param splitChar: splitting character used to separate fields in a line
        :type splitChar: str, defaults to ","
        """

        class FilehandleWithLinecounter:
            """Wrap a file handle to count the lines read and split them into fields."""

            def __init__(self, f, splitChar):
                self.f = f
                self.line = 0
                self.splitChar = splitChar

            def readline(self, doSplit=True):
                """Read one line without line ending; split into stripped fields if *doSplit*."""
                self.line += 1
                dmp = self.f.readline().replace("\n", "").replace("\r", "")
                if doSplit:
                    dmp = [word.strip(" ") for word in dmp.split(self.splitChar)]
                return dmp

        if self.inputFhandle.closed:
            self.inputFhandle = open(self.inputFhandle.name, encoding='utf-8')

        try:
            self._readHeader(FilehandleWithLinecounter(self.inputFhandle, splitChar))
        except Exception as err:
            # Reading stays best-effort (no exception leaves this method), but
            # the failure is no longer dropped silently.
            warnings.warn(f"Could not read ICARTT header: {err!r}")
        finally:
            self.inputFhandle.close()

    def _readHeader(self, f):
        """Parse the ICARTT header line by line and populate the dataset.

        :param f: line reader providing ``readline(doSplit=True)`` - returning
            the list of stripped fields of the next line, or the unsplit line
            if ``doSplit`` is False - and a ``line`` attribute counting the
            lines read so far

        :raises ValueError: if the file format index in line 1 is not supported
        """
        # line 1 - Number of lines in header, file format index (most files use
        # 1001) - comma delimited.
        dmp = f.readline()

        nHeaderSuggested = int(dmp[0])

        try:
            self.format = Formats(int(dmp[1]))
        except ValueError as ve:
            # dmp[1] is a string, so it must not be formatted with "{:d}"
            raise ValueError(f"ICARTT format {dmp[1]} not implemented") from ve

        if len(dmp) > 2:
            self.version = dmp[2]

        # line 2 - PI last name, first name/initial.
        self.PIName = f.readline(doSplit=False)

        # line 3 - Organization/affiliation of PI.
        self.PIAffiliation = f.readline(doSplit=False)

        # line 4 - Data source description (e.g., instrument name, platform name,
        # model name, etc.).
        self.dataSourceDescription = f.readline(doSplit=False)

        # line 5 - Mission name (usually the mission acronym).
        self.missionName = f.readline(doSplit=False)

        # line 6 - File volume number, number of file volumes (these integer values
        # are used when the data require more than one file per day; for data that
        # require only one file these values are set to 1, 1) - comma delimited.
        dmp = f.readline()
        self.fileVolumeNumber = int(dmp[0])
        self.totalNumberOfFileVolumes = int(dmp[1])

        def parseDate(fields):
            # Build the date from its integer parts. Joining the fields and
            # parsing them with "%Y%m%d" is ambiguous if month or day are not
            # zero-padded ("2020, 1, 15" would be read as 2020-11-05).
            year, month, day = (int(x) for x in fields)
            return datetime.datetime(year, month, day)

        # line 7 - UTC date when data begin, UTC date of data reduction or revision
        # - comma delimited (yyyy, mm, dd, yyyy, mm, dd).
        dmp = f.readline()
        self.dateOfCollection = parseDate(dmp[0:3])
        self.dateOfRevision = parseDate(dmp[3:6])

        # line 8 - Data Interval (This value describes the time spacing (in seconds)
        # between consecutive data records. It is the (constant) interval between
        # values of the independent variable. For 1 Hz data the data interval value
        # is 1 and for 10 Hz data the value is 0.1. All intervals longer than 1
        # second must be reported as Start and Stop times, and the Data Interval
        # value is set to 0. The Mid-point time is required when it is not at the
        # average of Start and Stop times. For additional information see Section
        # 2.5 below.).
        dmp = f.readline()
        # might have multiple entries for 2110
        self.dataIntervalCode = [float(x) for x in dmp]

        # line 9 - Description or name of independent variable (This is the name
        # chosen for the start time. It always refers to the number of seconds UTC
        # from the start of the day on which measurements began. It should be noted
        # here that the independent variable should monotonically increase even when
        # crossing over to a second day.

        def extractVardesc(dmp):
            # short name and units are mandatory, standard and long name optional
            shortname = dmp[0]
            units = dmp[1]
            standardname = dmp[2] if len(dmp) > 2 else None
            longname = dmp[3] if len(dmp) > 3 else None
            return shortname, units, standardname, longname

        if self.format == Formats.FFI2110:
            # for 2110, the independent bounded variable is described first
            shortname, units, standardname, longname = extractVardesc(f.readline())
            self.independentBoundedVariable = Variable(
                shortname,
                units,
                standardname,
                longname,
                vartype=VariableType.IndependentBoundedVariable,
            )

        shortname, units, standardname, longname = extractVardesc(f.readline())
        self.independentVariable = Variable(
            shortname,
            units,
            standardname,
            longname,
            vartype=VariableType.IndependentVariable,
        )

        def readVars(f, vtype):
            # line 10 - Number of variables (Integer value showing the number of
            # dependent variables: the total number of columns of data is this value
            # plus one.).
            nvar = int(f.readline()[0])

            # line 11- Scale factors (1 for most cases, except where grossly
            # inconvenient) - comma delimited.
            vscale = f.readline()

            # line 12 - Missing data indicators (This is -9999 (or -99999, etc.) for
            # any missing data condition, except for the main time (independent)
            # variable which is never missing) - comma delimited.
            vmiss = f.readline()
            # no float casting here, as we need to do string comparison lateron when reading data...

            # line 13 - Variable names and units (Short variable name and units are
            # required, and optional long descriptive name, in that order, and separated
            # by commas. If the variable is unitless, enter the keyword "none" for its
            # units. Each short variable name and units (and optional long name) are
            # entered on one line. The short variable name must correspond exactly to
            # the name used for that variable as a column header, i.e., the last header
            # line prior to start of data.).
            # One description line per variable; at least one line is always consumed.
            descs = [extractVardesc(f.readline()) for _ in range(max(nvar, 1))]

            variables = {}
            for desc, scale, miss in zip(descs, vscale, vmiss):
                shortname, units, standardname, longname = desc
                variables[shortname] = Variable(
                    shortname,
                    units,
                    standardname,
                    longname,
                    scale=scale,
                    miss=miss,
                    vartype=vtype,
                )
            return variables

        self.dependentVariables = readVars(f, VariableType.DependentVariable)

        if self.format == Formats.FFI2110:
            self.auxiliaryVariables = readVars(f, VariableType.AuxiliaryVariable)

        # line 14 + nvar - Number of SPECIAL comment lines (Integer value
        # indicating the number of lines of special comments, NOT including this
        # line.).
        nscom = int(f.readline()[0])

        # line 15 + nvar - Special comments (Notes of problems or special
        # circumstances unique to this file. An example would be comments/problems
        # associated with a particular flight.).
        self.specialComments = [f.readline(doSplit=False) for _ in range(nscom)]

        # line 16 + nvar + nscom - Number of Normal comments (i.e., number of
        # additional lines of SUPPORTING information: Integer value indicating the
        # number of lines of additional information, NOT including this line.).
        nncom = int(f.readline()[0])

        # line 17 + nvar + nscom - Normal comments (SUPPORTING information: This is
        # the place for investigators to more completely describe the data and
        # measurement parameters. The supporting information structure is described
        # below as a list of key word: value pairs. Specifically include here
        # information on the platform used, the geo-location of data, measurement
        # technique, and data revision comments. Note the non-optional information
        # regarding uncertainty, the upper limit of detection (ULOD) and the lower
        # limit of detection (LLOD) for each measured variable. The ULOD and LLOD
        # are the values, in the same units as the measurements that correspond to
        # the flags -7777's and -8888's within the data, respectively. The last line
        # of this section should contain all the "short" variable names on one line.
        # The key words in this section are written in BOLD below and must appear in
        # this section of the header along with the relevant data listed after the
        # colon. For key words where information is not needed or applicable, simply
        # enter N/A.).
        rawNcom = [f.readline(doSplit=False) for _ in range(nncom)]
        self.normalComments.ingest(rawNcom)

        # number of lines actually consumed; used to skip the header when
        # reading the data
        self.nHeaderFile = f.line

        if self.nHeader != nHeaderSuggested:
            warnings.warn(
                f"Number of header lines suggested in line 1 ({nHeaderSuggested:d}) do not match actual header lines read ({self.nHeader:d})"
            )

    def readData(self, splitChar=","):
        """Read ICARTT data (from file)

        Re-opens the input file if its handle is closed, skips the header
        lines counted while reading the header, hands all remaining non-blank
        lines to the data store and closes the handle again. Problems do not
        raise, but are reported as a warning.

        :param splitChar: splitting character used to separate fields in a line
        :type splitChar: str, defaults to ","
        """
        if self.inputFhandle.closed:
            self.inputFhandle = open(self.inputFhandle.name, encoding='utf-8')

        try:
            # skip the header
            for _ in range(self.nHeaderFile):
                self.inputFhandle.readline()

            # blank lines (e.g. at the end of the file) carry no data and
            # would otherwise abort the import part-way
            raw = [
                line.split(splitChar) for line in self.inputFhandle if line.strip()
            ]

            if raw:
                self.data.addBulkFromTxt(raw)
        except Exception as err:
            # Reading stays best-effort (no exception leaves this method), but
            # the failure is no longer dropped silently.
            warnings.warn(f"Could not read ICARTT data: {err!r}")
        finally:
            self.inputFhandle.close()

    def read(self, splitChar=","):
        """Read ICARTT data and header

        Calls, in this order, readHeader, endDefineMode and readData, passing
        *splitChar* to each of them.

        :param splitChar: splitting character used to separate fields in a line
        :type splitChar: str, defaults to ","
        """
        self.readHeader(splitChar)
        # NOTE(review): endDefineMode is not defined in this part of the file;
        # presumably it has to run between header and data reading - confirm.
        self.endDefineMode(splitChar) # TODO: endDefineMode does not take arg splitChar
        self.readData(splitChar)

    def makeFileName(self, dateFormat="%Y%m%d"):
        """Create ICARTT-compliant file name based on the information contained in the dataset

        :param dateFormat: date format to use when parsing
        :type dateFormat: str, defaults to '%Y%m%d'

        :return: file name generated
        :rtype: string
        """
        fn = (
            self.dataID
            + "_"
            + self.locationID
            + "_"
            + datetime.datetime.strftime(self.dateOfCollection, dateFormat)
        )
        fn += "_R" + str(self.revision) if not self.revision is None else ""
        fn += "_L" + str(self.launch) if not self.launch is None else ""
        fn += (
            "_V" + str(self.fileVolumeNumber)
            if self.totalNumberOfFileVolumes > 1
            else ""
        )

        return fn + ".ict"

    def isValidFileName(self, name):
        # ICARTT standard v2 2.1.1 3)
        # Filename: Uppercase and lowercase ASCII alphanumeric
        # characters (i.e. A-Z, a-z, 0-9), underscore, period,
        # and hyphen. File names can be a maximum 127
        # characters in length.
        def isAsciiAlpha(x):
            return re.match("[a-zA-Z0-9-_.]", x)

        allAsciiAlpha = all([isAsciiAlpha(x) for x in name])
        lessThan128Characters = len(name) < 128

        return allAsciiAlpha and lessThan128Characters

    def writeHeader(self, f=sys.stdout, delimiter=DEFAULT_FIELD_DELIM):
        """Write header

        :param f: handle to write to
        :type f: file handle or StringIO stream, defaults to sys.stdout
        """

        def prnt(txt): # TODO: rename? it writes to file, so "write_to_file"?
            f.write(str(txt) + "\n")

        # Number of lines in header, file format index (most files use 1001) - comma delimited.
        versInfo = [self.nHeader, self.format.value]
        if self.version is not None:
            versInfo.append(self.version)
        txt = delimiter.join([str(x) for x in versInfo])

        prnt(txt)
        # PI last name, first name/initial.
        prnt(self.PIName)
        # Organization/affiliation of PI.
        prnt(self.PIAffiliation)
        # Data source description (e.g., instrument name, platform name, model name, etc.).
        prnt(self.dataSourceDescription)
        # Mission name (usually the mission acronym).
        prnt(self.missionName)
        # File volume number, number of file volumes (these integer values are used when the data require more than one file per day; for data that require only one file these values are set to 1, 1) - comma delimited.
        prnt(
            delimiter.join(
                [str(self.fileVolumeNumber), str(self.totalNumberOfFileVolumes)]
            )
        )
        # UTC date when data begin, UTC date of data reduction or revision - comma delimited (yyyy, mm, dd, yyyy, mm, dd).
        prnt(
            delimiter.join(
                [
                    datetime.datetime.strftime(x, delimiter.join(["%Y", "%m", "%d"]))
                    for x in [self.dateOfCollection, self.dateOfRevision]
                ]
            )
        )
        # Data Interval (This value describes the time spacing (in seconds) between consecutive data records. It is the (constant) interval between values of the independent variable. For 1 Hz data the data interval value is 1 and for 10 Hz data the value is 0.1. All intervals longer than 1 second must be reported as Start and Stop times, and the Data Interval value is set to 0. The Mid-point time is required when it is not at the average of Start and Stop times. For additional information see Section 2.5 below.).
        prnt(delimiter.join([str(x) for x in self.dataIntervalCode]))
        if self.format == Formats.FFI2110:
            # Description or name of independent (bound) variable (This is the name chosen for the start time. It always refers to the number of seconds UTC from the start of the day on which measurements began. It should be noted here that the independent variable should monotonically increase even when crossing over to a second day.).
            prnt(self.independentBoundedVariable.desc(delimiter))
        # Description or name of independent variable (This is the name chosen for the start time. It always refers to the number of seconds UTC from the start of the day on which measurements began. It should be noted here that the independent variable should monotonically increase even when crossing over to a second day.).
        prnt(self.independentVariable.desc(delimiter))
        # Number of variables (Integer value showing the number of dependent variables: the total number of columns of data is this value plus one.).
        prnt(len(self.dependentVariables))
        # Scale factors (1 for most cases, except where grossly inconvenient) - comma delimited.
        prnt(
            delimiter.join(
                [str(DVAR.scale) for DVAR in self.dependentVariables.values()]
            )
        )
        # Missing data indicators (This is -9999 (or -99999, etc.) for any missing data condition, except for the main time (independent) variable which is never missing) - comma delimited.
        prnt(
            delimiter.join(
                [str(DVAR.miss) for DVAR in self.dependentVariables.values()]
            )
        )
        # Variable names and units (Short variable name and units are required, and optional long descriptive name, in that order, and separated by commas. If the variable is unitless, enter the keyword "none" for its units. Each short variable name and units (and optional long name) are entered on one line. The short variable name must correspond exactly to the name used for that variable as a column header, i.e., the last header line prior to start of data.).
        _ = [prnt(DVAR.desc(delimiter)) for DVAR in self.dependentVariables.values()]
        if self.format == Formats.FFI2110:
            # Number of variables (Integer value showing the number of dependent variables: the total number of columns of data is this value plus one.).
            prnt(len(self.auxiliaryVariables))
            # Scale factors (1 for most cases, except where grossly inconvenient) - comma delimited.
            prnt(
                delimiter.join(
                    [str(AUXVAR.scale) for AUXVAR in self.auxiliaryVariables.values()]
                )
            )
            # Missing data indicators (This is -9999 (or -99999, etc.) for any missing data condition, except for the main time (independent) variable which is never missing) - comma delimited.
            prnt(
                delimiter.join(
                    [str(AUXVAR.miss) for AUXVAR in self.auxiliaryVariables.values()]
                )
            )
            # Variable names and units (Short variable name and units are required, and optional long descriptive name, in that order, and separated by commas. If the variable is unitless, enter the keyword "none" for its units. Each short variable name and units (and optional long name) are entered on one line. The short variable name must correspond exactly to the name used for that variable as a column header, i.e., the last header line prior to start of data.).
            _ = [
                prnt(AUXVAR.desc(delimiter))
                for AUXVAR in self.auxiliaryVariables.values()
            ]

        # Number of SPECIAL comment lines (Integer value indicating the number of lines of special comments, NOT including this line.).
        prnt("{:d}".format(len(self.specialComments)))
        # Special comments (Notes of problems or special circumstances unique to this file. An example would be comments/problems associated with a particular flight.).
        _ = [prnt(x) for x in self.specialComments]
        # Number of Normal comments (i.e., number of additional lines of SUPPORTING information: Integer value indicating the number of lines of additional information, NOT including this line.).
        prnt("{:d}".format(self.normalComments.nlines))
        # Normal comments (SUPPORTING information: This is the place for investigators to more completely describe the data and measurement parameters. The supporting information structure is described below as a list of key word: value pairs. Specifically include here information on the platform used, the geo-location of data, measurement technique, and data revision comments. Note the non-optional information regarding uncertainty, the upper limit of detection (ULOD) and the lower limit of detection (LLOD) for each measured variable. The ULOD and LLOD are the values, in the same units as the measurements that correspond to the flags -7777s and -8888s within the data, respectively. The last line of this section should contain all the short variable names on one line. The key words in this section are written in BOLD below and must appear in this section of the header along with the relevant data listed after the colon. For key words where information is not needed or applicable, simply enter N/A.).
        # re-create last line out of actual data if missing...
        if self.normalComments.shortnames == []:
            self.normalComments.shortnames = delimiter.join(
                [self.variables[x].shortname for x in self.variables]
            )
        _ = [prnt(x) for x in self.normalComments]

    def writeData(
        self, f=sys.stdout, fmt=DEFAULT_NUM_FORMAT, delimiter=DEFAULT_FIELD_DELIM
    ):
        """Write the data section only (no header).

        The actual output is produced by the data store held in ``self.data``;
        all arguments are forwarded to its ``write`` method unchanged.

        :param f: handle to write to
        :type f: file handle or StringIO stream, defaults to sys.stdout
        :param fmt: number format forwarded to the data store
        :type fmt: str, defaults to DEFAULT_NUM_FORMAT
        :param delimiter: field delimiter forwarded to the data store
        :type delimiter: str, defaults to DEFAULT_FIELD_DELIM
        """
        dataStore = self.data
        dataStore.write(f=f, fmt=fmt, delimiter=delimiter)

    def write(
        self, f=sys.stdout, fmt=DEFAULT_NUM_FORMAT, delimiter=DEFAULT_FIELD_DELIM
    ):
        """Write the complete file: header first, then data.

        :param f: handle to write to
        :type f: file handle or StringIO stream, defaults to sys.stdout
        :param fmt: number format, used for the data section only
        :type fmt: str, defaults to DEFAULT_NUM_FORMAT
        :param delimiter: field delimiter used for both header and data
        :type delimiter: str, defaults to DEFAULT_FIELD_DELIM
        """
        # Both sections share the target handle and the delimiter; only the
        # data section takes a number format.
        sharedArgs = {"f": f, "delimiter": delimiter}
        self.writeHeader(**sharedArgs)
        self.writeData(fmt=fmt, **sharedArgs)

    def endDefineMode(self):
        """Leave define mode and create the data store for this dataset.

        Call this once all variables have been defined and before any data is
        added. The kind of data store depends on ``self.format``:

        * ``Formats.FFI1001``: a ``DataStore1001`` built from the independent
          and the dependent variables.
        * ``Formats.FFI2110``: a ``DataStore2110`` built from the independent,
          independent bounded, auxiliary and dependent variables.

        For any other format ``self.data`` is left as it is.
        """
        self.defineMode = False

        fileFormat = self.format
        if fileFormat == Formats.FFI1001:
            store = DataStore1001(self.independentVariable, self.dependentVariables)
        elif fileFormat == Formats.FFI2110:
            store = DataStore2110(
                self.independentVariable,
                self.independentBoundedVariable,
                self.auxiliaryVariables,
                self.dependentVariables,
            )
        else:
            # unknown format: no data store is created
            return

        self.data = store

    def __del__(self):
        """Close the input file handle, if there is one, on object destruction.

        This is best-effort cleanup: ordinary errors raised while checking or
        closing the handle are ignored, since a finalizer has no caller that
        could handle them.
        """
        # The attribute only exists if the constructor was given a file (and
        # got far enough to assign it), so look it up defensively instead of
        # relying on an exception handler to hide the AttributeError.
        handle = getattr(self, "inputFhandle", None)
        if handle is None:
            return

        try:
            if not handle.closed:
                handle.close()
        except Exception:
            # Narrower than a bare ``except``: KeyboardInterrupt, SystemExit
            # and other BaseExceptions are no longer silently swallowed.
            pass

    def __init__(self, f=None, loadData=True, splitChar=",", format=Formats.FFI1001):
        """Create a dataset with placeholder metadata and optionally read a file.

        :param f: file to read; ``None`` creates an empty dataset in define mode
        :type f: str, pathlib.Path or open file handle, defaults to None
        :param loadData: if a file is given, also read its data section after
            the header (this ends define mode)
        :type loadData: bool, defaults to True
        :param splitChar: separator handed to ``readHeader`` and ``readData``
        :type splitChar: str, defaults to ","
        :param format: file format index of the dataset
        :type format: Formats, defaults to Formats.FFI1001
        """
        # TODO: why is init coming last?

        # --- format / identification
        self.format = format
        self.version = None
        self.dataID, self.locationID = "dataID", "locationID"

        # --- revision and file volume bookkeeping
        self.revision = 0
        self.launch = None
        self.fileVolumeNumber = self.totalNumberOfFileVolumes = 1

        # --- placeholder metadata
        self.PIName = "Mustermann, Martin"
        self.PIAffiliation = "Musterinstitut"
        self.dataSourceDescription = "Musterdatenprodukt"
        self.missionName = "MUSTEREX"

        # TODO: not 100% sure if this is relevant here, but those dates should
        # refer to UTC - if you leave the datetime objects naive (no tzinfo
        # set), they represent local time.
        self.dateOfCollection = datetime.datetime.today()
        self.dateOfRevision = datetime.datetime.today()
        self.dataIntervalCode = [0.0]

        # --- variable definitions (filled while in define mode)
        self.independentVariable = None
        self.independentBoundedVariable = None
        self.auxiliaryVariables = {}
        self.dependentVariables = {}

        # --- comments
        self.specialComments = []
        self.normalComments = StandardNormalComments()

        # Standard v2.0 for normal comments requires all keywords present,
        # might not be the case - then reading data will fail
        self.nHeaderFile = -1

        # no data store until define mode is ended
        self.data = None
        self.defineMode = True

        # read data if f is not None
        if f is not None:
            # a path is opened here, anything else is taken to be a handle
            givenAsPath = isinstance(f, (str, pathlib.Path))
            self.inputFhandle = open(f, "r", encoding="utf-8") if givenAsPath else f

            # TODO should we add a filename characters check, as we have it
            # for the variable names?

            self.readHeader(splitChar)
            if loadData:
                self.endDefineMode()
                self.readData(splitChar)