# dataset.py

import datetime
import sys
import collections
import re
import math
import string
import warnings

# File format indices accepted by Dataset.read_header; any other index raises ValueError there.
IMPLEMENTED_FORMATS = [1001, 2110]

class StandardNormalComments(collections.UserList):
    '''Normal comments of an ICARTT header, split into free-form lines,
    keyword/value pairs and the final short-name line.

    Accessed as a list (through :attr:`data`), the free-form lines come first,
    followed by one ``KEYWORD: value`` line per stored keyword and finally the
    short-name line.

    :param contents: raw normal comment lines; the last line is taken as the short-name line
    :type contents: list of str, defaults to None (no comments)
    '''
    @property
    def data(self):
        '''Comment lines in output order, rebuilt on every access

        :return: free-form lines, keyword lines, then the short-name line (if one is set)
        :rtype: list
        '''
        lines = self.freeform + [ k + ": " + str(v) for k, v in self.keywords.items() ]
        # shortnames keeps its initial value [] until a short-name line was ingested or assigned
        if self.shortnames != []:
            lines.append(self.shortnames)
        return lines

    def ingest(self, raw):
        '''Sort raw comment lines into short names, keywords and free-form text.

        The passed list is not modified. An empty list is ignored.

        :param raw: raw normal comment lines, the last one being the short-name line
        :type raw: list of str
        '''
        if not raw:
            return

        # last line is always shortname
        self.shortnames = raw[-1]

        # rest is either keyword, or free form
        for line in raw[:-1]:
            for k in self.possible_keywords:
                prefix = k + ":"
                if line.startswith(prefix):
                    # cut off only the leading "KEYWORD:" so that a value which
                    # happens to contain the same text is kept intact
                    self.keywords[k] = line[len(prefix):].strip()
                    break
            else:
                self.freeform.append(line)

    def __init__(self, contents=None):
        '''Constructor method
        '''
        self.freeform = []
        self.possible_keywords = [
            "PI_CONTACT_INFO",
            "PLATFORM",
            "LOCATION",
            "ASSOCIATED_DATA",
            "INSTRUMENT_INFO",
            "DATA_INFO",
            "UNCERTAINTY",
            "ULOD_FLAG",
            "ULOD_VALUE",
            "LLOD_FLAG",
            "LLOD_VALUE",
            "DM_CONTACT_INFO",
            "PROJECT_INFO",
            "STIPULATIONS_ON_USE",
            "OTHER_COMMENTS",
            "REVISION"
            ] + \
            [ "R{:d}".format(x) for x in range(9) ] + \
            [ "R{:s}".format(y) for y in string.ascii_uppercase ]
        # NOTE(review): range(9) yields R0..R8 only - confirm whether R9 is left out on purpose
        self.keywords   = {}
        self.shortnames = []

        # None or an empty list means "no comments"; nothing to ingest then
        if contents:
            self.ingest(contents)

class Variable(collections.UserList):
    '''A single ICARTT variable: its description (name, units, long name),
    scale factor, missing value marker and the list of stored data entries.

    :param name: Name of the variable
    :type name: str

    :param units: Units of the variable
    :type units: str

    :param longname: Long name of the variable
    :type longname: str

    :param scale: Scaling factor for the variable
    :type scale: float, defaults to 1.0

    :param miss: Missing value for the variable
    :type miss: float, defaults to -99999.0

    :param splitChar: Split character for text representation
    :type splitChar: str, defaults to ","
    '''
    def __init__(self, name, units, longname, scale=1.0, miss=-99999.0, splitChar=","):
        '''Constructor method
        '''
        self.data = []

        self.name, self.units, self.longname = name, units, longname
        self.scale, self.miss = scale, miss
        self.splitChar = splitChar

    @property
    def desc(self):
        '''Name, units and long name joined by the split character

        :return: description string
        :rtype: str
        '''
        fields = (self.name, self.units, self.longname)
        return self.splitChar.join(map(str, fields))

    def append(self, *argv):
        '''Store one data entry. All given values are converted to float; a value
        equal to the missing value marker is stored as NaN.

        The stored entry depends on the number of values given:
        one value ``a`` is stored as ``a``, two values as ``(a, b)``, and
        three or more as ``((a, b, ...), z)`` with ``z`` being the last value.

        :param ivar: value of the independent (unbounded) variable
        :type ivar: float

        :param ibvar: value of the independent (bounded) variable
        :type ibvar: float, optional

        :param dvar: value of the dependent variable
        :type dvar: float, optional
        '''
        def _clean(raw):
            number = float(raw)
            if number == float(self.miss):
                return float('NaN')
            return number

        values = [_clean(raw) for raw in argv]
        count = len(values)

        if count > 2:
            entry = (tuple(values[:-1]), values[-1])
        elif count == 2:
            entry = (values[0], values[1])
        else:
            entry = values[0]

        self.data.append(entry)

class Dataset:
    '''An ICARTT dataset that can be created from scratch or read from a file,
    manipulated, and then written to a file.

    :param f: file path or file handle to use
    :type f: str or file handle or stream object, defaults to None

    :param loadData: load data as well (or only header if False)?
    :type loadData: bool, defaults to "True"

    :param splitChar: splitting character used to separate fields in a line
    :type splitChar: str, defaults to ","
    '''
    @property
    def nheader(self):
        '''Number of header lines, computed from the current contents

        :return: line count, or -1 for a format other than 1001 and 2110
        :rtype: int
        '''
        if self.format == 2110:
            # 16 fixed lines plus 2 (IVAR + IBVAR), plus both variable blocks and both comment blocks
            blocks = (self.AUXVARS, self.DVARS, self.SCOM, self.NCOM)
            return 16 + 2 + sum(len(block) for block in blocks)
        if self.format == 1001:
            blocks = (self.DVARS, self.SCOM, self.NCOM)
            return 14 + sum(len(block) for block in blocks)
        return -1

    @property
    def varnames(self):
        '''Names of all variables, in the order given by :attr:`VARS`

        :return: list of variable names
        :rtype: list
        '''
        return list(self.VARS)

    @property
    def times(self):
        '''Time steps of the data

        :return: list of time steps
        :rtype: list
        '''
        return [ self.dateValid + datetime.timedelta(seconds=x) for x in self.IVAR ]

    @property
    def VARS(self):
        '''All variables by name. For format 2110 the bounded independent
        variable comes first and the auxiliary variables last; in between are
        the independent variable and the dependent variables.

        :return: dictionary of all variables
        :rtype: dict of Variable(s)
        '''
        collected = {}
        if self.format == 2110:
            collected[self.IBVAR.name] = self.IBVAR
        collected[self.IVAR.name] = self.IVAR
        collected.update(self.DVARS)
        if self.format == 2110:
            collected.update(self.AUXVARS)
        return collected
    
    def __getitem__(self, name):
        '''Look up a variable by name, so that ``dataset[name]`` works

        :return: the entry of :attr:`VARS` stored under ``name``
        :rtype: list
        '''
        lookup = self.VARS
        return lookup[name]
    
    def write_header(self, f=sys.stdout):
        '''Write header

        :param f: handle to write to
        :type f: file handle or StringIO stream, defaults to sys.stdout
        '''
        def prnt(txt):
            f.write(str(txt) + "\n")

        # Number of lines in header, file format index (most files use 1001) - comma delimited.
        txt = "{:d}, {:d}".format(self.nheader, self.format)
        if self.version is not None:
            txt = "{:d}, {:d}, {:s}".format(self.nheader, self.format, self.version) 
        prnt(txt)
        # PI last name, first name/initial.
        prnt(self.PI)
        # Organization/affiliation of PI.
        prnt(self.organization)
        # Data source description (e.g., instrument name, platform name, model name, etc.).
        prnt(self.dataSource)
        # Mission name (usually the mission acronym).
        prnt(self.mission)
        # File volume number, number of file volumes (these integer values are used when the data require more than one file per day; for data that require only one file these values are set to 1, 1) - comma delimited.
        prnt(self.splitChar.join([ str(self.volume), str(self.nvolumes) ]))
        # UTC date when data begin, UTC date of data reduction or revision - comma delimited (yyyy, mm, dd, yyyy, mm, dd).
        prnt(self.splitChar.join([ datetime.datetime.strftime(x, self.splitChar.join(["%Y","%m","%d"])) for x in [ self.dateValid, self.dateRevised ] ]))
        # Data Interval (This value describes the time spacing (in seconds) between consecutive data records. It is the (constant) interval between values of the independent variable. For 1 Hz data the data interval value is 1 and for 10 Hz data the value is 0.1. All intervals longer than 1 second must be reported as Start and Stop times, and the Data Interval value is set to 0. The Mid-point time is required when it is not at the average of Start and Stop times. For additional information see Section 2.5 below.).
        prnt(self.dataInterval)
        if self.format == 2110:
            # Description or name of independent (bound) variable (This is the name chosen for the start time. It always refers to the number of seconds UTC from the start of the day on which measurements began. It should be noted here that the independent variable should monotonically increase even when crossing over to a second day.).
            prnt(self.IBVAR.desc)
        # Description or name of independent variable (This is the name chosen for the start time. It always refers to the number of seconds UTC from the start of the day on which measurements began. It should be noted here that the independent variable should monotonically increase even when crossing over to a second day.).
        prnt(self.IVAR.desc)
        # Number of variables (Integer value showing the number of dependent variables: the total number of columns of data is this value plus one.).
        prnt(len(self.DVARS))
        # Scale factors (1 for most cases, except where grossly inconvenient) - comma delimited.
        prnt(self.splitChar.join( [ "{:.1g}".format(DVAR.scale) for DVAR in self.DVARS.values() ]))
        # Missing data indicators (This is -9999 (or -99999, etc.) for any missing data condition, except for the main time (independent) variable which is never missing) - comma delimited.
        prnt(self.splitChar.join( [ str(DVAR.miss) for DVAR in self.DVARS.values() ]))
        # Variable names and units (Short variable name and units are required, and optional long descriptive name, in that order, and separated by commas. If the variable is unitless, enter the keyword "none" for its units. Each short variable name and units (and optional long name) are entered on one line. The short variable name must correspond exactly to the name used for that variable as a column header, i.e., the last header line prior to start of data.).
        nul = [ prnt(DVAR.desc) for DVAR in self.DVARS.values() ]
        if self.format == 2110:
            # Number of variables (Integer value showing the number of dependent variables: the total number of columns of data is this value plus one.).
            prnt(len(self.AUXVARS))
            # Scale factors (1 for most cases, except where grossly inconvenient) - comma delimited.
            prnt(self.splitChar.join( [ "{:.1g}".format(AUXVAR.scale) for AUXVAR in self.AUXVARS.values() ]))
            # Missing data indicators (This is -9999 (or -99999, etc.) for any missing data condition, except for the main time (independent) variable which is never missing) - comma delimited.
            prnt(self.splitChar.join( [ str(AUXVAR.miss) for AUXVAR in self.AUXVARS.values() ]))
            # Variable names and units (Short variable name and units are required, and optional long descriptive name, in that order, and separated by commas. If the variable is unitless, enter the keyword "none" for its units. Each short variable name and units (and optional long name) are entered on one line. The short variable name must correspond exactly to the name used for that variable as a column header, i.e., the last header line prior to start of data.).
            nul = [ prnt(AUXVAR.desc) for AUXVAR in self.AUXVARS.values() ]

        # Number of SPECIAL comment lines (Integer value indicating the number of lines of special comments, NOT including this line.).
        prnt("{:d}".format(len(self.SCOM)))
        # Special comments (Notes of problems or special circumstances unique to this file. An example would be comments/problems associated with a particular flight.).
        nul = [ prnt(x) for x in self.SCOM ]
        # Number of Normal comments (i.e., number of additional lines of SUPPORTING information: Integer value indicating the number of lines of additional information, NOT including this line.).
        prnt("{:d}".format(len(self.NCOM)))
        # Normal comments (SUPPORTING information: This is the place for investigators to more completely describe the data and measurement parameters. The supporting information structure is described below as a list of key word: value pairs. Specifically include here information on the platform used, the geo-location of data, measurement technique, and data revision comments. Note the non-optional information regarding uncertainty, the upper limit of detection (ULOD) and the lower limit of detection (LLOD) for each measured variable. The ULOD and LLOD are the values, in the same units as the measurements that correspond to the flags -7777s and -8888s within the data, respectively. The last line of this section should contain all the short variable names on one line. The key words in this section are written in BOLD below and must appear in this section of the header along with the relevant data listed after the colon. For key words where information is not needed or applicable, simply enter N/A.).
        nul = [ prnt(x) for x in self.NCOM ]
    
    def _write_data_1001(self, prnt=lambda x: sys.stdout.write(x)):
        '''Emit the data records of a format 1001 dataset, one list per record.

        Each record holds the value of the independent variable followed by the
        value of every dependent variable at the same position; NaN values are
        replaced by the missing value of the respective variable.

        :param prnt: callable receiving each record as a list
        :type prnt: callable
        '''
        # NOTE(review): the default prnt hands a list to sys.stdout.write - confirm it is ever used
        def or_missing(value, variable):
            if math.isnan(value):
                return variable.miss
            return value

        for row, ivalue in enumerate(self.IVAR):
            record = [ or_missing(ivalue, self.IVAR) ]
            for dvar in self.DVARS.values():
                record.append(or_missing(dvar[row][1], dvar))
            prnt(record)

    def _write_data_2110(self, prnt=lambda x: sys.stdout.write(x)):
        '''Emit the data records of a format 2110 dataset, one list per record.

        For every value of the independent variable, one record with that value
        and the matching auxiliary values is emitted. It is followed by one
        record per matching value of the bounded independent variable, holding
        that value and the matching dependent values. NaN values are replaced by
        the missing value of the respective variable.

        :param prnt: callable receiving each record as a list
        :type prnt: callable
        '''
        def or_missing(value, variable):
            if math.isnan(value):
                return variable.miss
            return value

        for ival in self.IVAR:
            aux_record = [ or_missing(ival, self.IVAR) ]
            for auxvar in self.AUXVARS.values():
                for aux_entry in auxvar:
                    if aux_entry[0] == ival:
                        aux_record.append(or_missing(aux_entry[1], auxvar))
            prnt(aux_record)

            bounded_values = [ entry[1] for entry in self.IBVAR if entry[0] == ival ]
            for ibval in bounded_values:
                record = [ or_missing(ibval, self.IBVAR) ]
                for dvar in self.DVARS.values():
                    for d_entry in dvar:
                        if (d_entry[0][0] == ival) and (d_entry[0][1] == ibval):
                            record.append(or_missing(d_entry[1], dvar))
                prnt(record)
    
    def write_data(self, f=sys.stdout):
        '''Write data

        :param f: handle to write to
        :type f: file handle or StringIO stream, defaults to sys.stdout
        '''
        def prnt_data(vars):
            f.write( str(self.splitChar.join([ str(x) for x in vars ])) + "\n")
        
        if self.format == 1001:
            nul = self._write_data_1001(prnt=prnt_data)
        elif self.format == 2110:
            nul = self._write_data_2110(prnt=prnt_data)
        else:
            warnings.warn("Unknown file format {:d}".format(self.format))
    
    def write(self, f=sys.stdout):
        '''Write header and data

        :param f: handle to write to
        :type f: file handle or StringIO stream, defaults to sys.stdout
        '''
        self.write_header(f=f)
        self.write_data(f=f)
    
    def make_filename(self, date_format='%Y%m%d'):
        '''Create ICARTT-compliant file name based on the information contained in the dataset

        :param date_format: date format to use when parsing
        :type date_format: str, defaults to '%Y%m%d'

        :return: file name generated
        :rtype: string
        '''
        fn  = self.dataID + "_" +self.locationID + "_" +datetime.datetime.strftime(self.dateValid, date_format)
        fn += "_R" + str(self.revision) if not self.revision is None else ""
        fn += "_L" + str(self.launch)   if not self.launch is None   else ""
        fn += "_V" + str(self.volume)   if self.nvolumes > 1         else ""
        
        return fn + ".ict"

    def read_header(self):
        '''Read the ICARTT header (from file)

        The header is parsed line by line from ``self.input_fhandle`` (which is
        reopened by name if it was closed) and stored in the attributes of the
        dataset. The handle is closed afterwards, also if parsing fails. A
        warning is issued if the header length stated in the first line does
        not match the header length computed from the parsed contents.

        :raises ValueError: if the file format index is not in IMPLEMENTED_FORMATS
        '''
        class Filehandle_with_linecounter:
            # Wrapper around the file handle that counts the lines read so far
            def __init__(self, f, splitChar):
                self.f         = f
                self.line      = 0
                self.splitChar = splitChar
            def readline(self, do_split=True):
                # returns the next line without line break characters, either as a
                # string or (do_split) as a list of fields stripped of blanks
                self.line += 1
                dmp = self.f.readline().replace('\n', '').replace('\r','')
                if do_split:
                    dmp = [ word.strip(' ') for word in dmp.split(self.splitChar) ]
                return dmp

        def parse_date(fields):
            # Convert the fields one by one: joining them and parsing with
            # '%Y%m%d' misreads dates that are not zero padded (1, 12 vs. 11, 2)
            year, month, day = ( int(x) for x in fields )
            return datetime.datetime(year, month, day)

        def describe(dmp):
            # name, units, long name; the units double as long name if none is given
            return dmp[0], dmp[1], dmp[2 if len(dmp) > 2 else 1]

        def read_vars(f):
            # Number of variables in this block.
            nvar = int(f.readline()[0])

            # Scale factors (1 for most cases) - one per variable.
            vscale = [ float(x) for x in f.readline() ]

            # Missing data indicators - one per variable. They are cast to float
            # here; Variable.append compares them numerically.
            vmiss = [ float(x) for x in f.readline() ]

            # One description line per variable (short name, units, optional
            # long name). The first one is read unconditionally, i.e. also
            # if nvar is 0.
            descriptions = [ describe(f.readline()) ]
            for i in range(1, nvar):
                descriptions.append(describe(f.readline()))

            return { name: Variable(name, unit, longname, scale=scale, miss=miss, splitChar=self.splitChar) for (name, unit, longname), scale, miss in zip(descriptions, vscale, vmiss) }

        if self.input_fhandle.closed:
            self.input_fhandle = open(self.input_fhandle.name)

        f = Filehandle_with_linecounter(self.input_fhandle, self.splitChar)

        try:
            # line 1 - Number of lines in header, file format index (most files
            # use 1001) and optionally a version.
            dmp = f.readline()

            nheader_suggested = int(dmp[0])
            self.format = int(dmp[1])
            if len(dmp) > 2:
                self.version = dmp[2]

            if self.format not in IMPLEMENTED_FORMATS:
                raise ValueError("ICARTT format {:d} not implemented".format(self.format))

            # line 2 - PI last name, first name/initial.
            self.PI = f.readline(do_split=False)

            # line 3 - Organization/affiliation of PI.
            self.organization = f.readline(do_split=False)

            # line 4 - Data source description (e.g., instrument name, platform
            # name, model name, etc.).
            self.dataSource = f.readline(do_split=False)

            # line 5 - Mission name (usually the mission acronym).
            self.mission = f.readline(do_split=False)

            # line 6 - File volume number, number of file volumes.
            dmp = f.readline()
            self.volume   = int(dmp[0])
            self.nvolumes = int(dmp[1])

            # line 7 - Date when data begin, date of data reduction or revision
            # (yyyy, mm, dd, yyyy, mm, dd).
            dmp = f.readline()
            self.dateValid   = parse_date(dmp[0:3])
            self.dateRevised = parse_date(dmp[3:6])

            # line 8 - Data interval.
            self.dataInterval = float(f.readline()[0])

            # line 9 - Description of the independent variable; in format 2110
            # it is preceded by the independent (bounded) variable.
            if self.format == 2110:
                self.IBVAR = Variable(*describe(f.readline()), splitChar=self.splitChar)

            self.IVAR = Variable(*describe(f.readline()), splitChar=self.splitChar)

            # Dependent variables, in format 2110 followed by the auxiliary variables.
            self.DVARS = read_vars(f)

            if self.format == 2110:
                self.AUXVARS = read_vars(f)

            # Number of special comment lines, followed by the special comments.
            nscom = int(f.readline()[0])
            self.SCOM = [ f.readline(do_split=False) for i in range(nscom) ]

            # Number of normal comment lines, followed by the normal comments.
            nncom = int(f.readline()[0])
            raw_ncom = [ f.readline(do_split=False) for i in range(nncom) ]
            self.NCOM = StandardNormalComments(raw_ncom)

            # number of lines actually read; read_data skips this many lines
            self.nheader_file = f.line
        finally:
            self.input_fhandle.close()

        if self.nheader != nheader_suggested:
            warnings.warn("Number of header lines suggested in line 1 ({:d}) do not match actual header lines read ({:d})".format(nheader_suggested, self.nheader))

    def _extract_items_1001(self, raw):
        '''Distribute the raw records of a format 1001 file to the variables.

        The first field of each record goes to the independent variable; each
        following field goes, together with the first field, to the dependent
        variable at the same position.

        :param raw: data records, each one a list of fields
        :type raw: list of lists
        '''
        for record in raw:
            ivalue = record[0]
            self.IVAR.append(ivalue)
            for column, key in enumerate(self.DVARS, start=1):
                self.DVARS[key].append(ivalue, record[column])

    def _extract_items_2110(self, raw):
        '''Distribute the raw records of a format 2110 file to the variables.

        Records come in groups: one record with the value of the independent
        variable and the auxiliary values, followed by as many records as the
        first auxiliary variable states. Each of those holds a value of the
        bounded independent variable and the dependent values.

        :param raw: data records, each one a list of fields
        :type raw: list of lists
        '''
        # the first auxiliary variable gives the number of records that follow
        count_key = list(self.AUXVARS.keys())[0]

        position = 0
        while position < len(raw):
            ivalue = raw[position][0]
            self.IVAR.append(ivalue)
            for column, key in enumerate(self.AUXVARS, start=1):
                self.AUXVARS[key].append(ivalue, raw[position][column])

            nprimary = int(self.AUXVARS[count_key][-1][1])
            for offset in range(1, nprimary + 1):
                record = raw[position + offset]
                self.IBVAR.append(ivalue, record[0])
                for column, key in enumerate(self.DVARS, start=1):
                    self.DVARS[key].append(ivalue, record[0], record[column])

            position += 1 + nprimary
    
    def read_data(self):
        '''Read ICARTT data (from file)
        '''
        if self.input_fhandle.closed:
            self.input_fhandle = open(self.input_fhandle.name)

        nul = [ self.input_fhandle.readline() for i in range(self.nheader_file) ]

        raw = [ line.split(self.splitChar) for line in self.input_fhandle ]
        if self.format == 1001:
            nul = self._extract_items_1001(raw)
        elif self.format == 2110:
            nul = self._extract_items_1001(raw)
        else:
            warnings.warn("Unknown file format: {:d}, could not read data.".format(self.format))
        
        self.input_fhandle.close()

    def read(self):
        '''Read the complete file: first the header, then the data
        '''
        for step in (self.read_header, self.read_data):
            step()
        
    def __del__(self):
        try:
            if not self.input_fhandle.closed:
                self.input_fhandle.close()
        except:
            pass
        
    def __init__(self, f=None, loadData=True, splitChar=","):
        '''Constructor method

        Sets up a format 1001 dataset with placeholder metadata and variables.
        If ``f`` is given, the header (and, if ``loadData`` is set, the data) is
        read from it right away.

        :param f: file path (str or path-like object) or file handle to use
        :type f: str, os.PathLike, file handle or stream object, defaults to None

        :param loadData: load data as well (or only header if False)?
        :type loadData: bool, defaults to True

        :param splitChar: splitting character used to separate fields in a line
        :type splitChar: str, defaults to ","
        '''
        self.format       = 1001
        self.version      = None

        self.dataID       = 'dataID'
        self.locationID   = 'locationID'

        self.revision     = 0
        self.launch       = None
        self.volume       = 1
        self.nvolumes     = 1

        self.PI           = 'Mustermann, Martin'
        self.organization = 'Musterinstitut'
        self.dataSource   = 'Musterdatenprodukt'
        self.mission      = 'MUSTEREX'
        self.dateValid    = datetime.datetime.today()
        self.dateRevised  = datetime.datetime.today()
        self.dataInterval = 0.0
        self.IVAR         = Variable('Time_Start',
                                     'seconds_from_0_hours_on_valid_date',
                                     'seconds_from_0_hours_on_valid_date',
                                     scale=1.0, miss=-9999999, splitChar=splitChar)
        self.IBVAR        = None
        self.AUXVARS      = {}
        self.DVARS        = {
                            'Time_Stop':
                            Variable('Time_Stop',
                                     'seconds_from_0_hours_on_valid_date',
                                     'seconds_from_0_hours_on_valid_date',
                                     scale=1.0, miss=-9999999, splitChar=splitChar),
                            'Some_Variable':
                            Variable('Some_Variable',
                                     'ppbv',
                                     'ppbv',
                                     scale=1.0, miss=-9999999, splitChar=splitChar)
                            }

        self.SCOM         = []
        self.NCOM         = []

        self.splitChar    = splitChar

        # Standard v2.0 for normal comments requires all keywords present,
        # might not be the case - then reading data will fail
        self.nheader_file = -1

        # read data if f is not None
        if f is not None:
            # str and path-like objects (e.g. pathlib.Path) are opened here;
            # anything else is taken to be an open file handle or stream
            if isinstance(f, str) or hasattr(f, "__fspath__"):
                self.input_fhandle = open(f, 'r')
            else:
                self.input_fhandle = f

            self.read_header()
            if loadData:
                self.read_data()


'''
Usage examples for Variable.append (ivar, ibvar and dvar stand for the
independent, the bounded independent and a dependent variable):

        - file type 1001, add value of independent variable:
        ivar.append(234.4)

        - file type 1001, add value of dependent variable:
        dvar.append(234.4, 18.2)


        - file type 2110, add value of independent (unbounded) variable:
        ivar.append(234.4)

        - file type 2110, add value of independent (bounded) variable:
        ibvar.append(234.4, 9148.2)

        - file type 2110, add value of dependent variable:
        dvar.append(234.4, 9148.2, 34.2)
'''