Skip to content
dataset.py 36 KiB
Newer Older
Christoph.Knote's avatar
sdf
Christoph.Knote committed
import datetime
import sys
Christoph Knote's avatar
Christoph Knote committed
import collections
Christoph Knote's avatar
Christoph Knote committed
import re
Christoph Knote's avatar
Christoph Knote committed
import math
Christoph Knote's avatar
Christoph Knote committed
import warnings
import numpy as np
from io import StringIO
Christoph.Knote's avatar
sdf
Christoph.Knote committed

from enum import Enum, IntEnum

class Formats(IntEnum):
    """ICARTT file format indices (FFI) known to this module."""

    FFI_1001 = 1001
    FFI_2110 = 2110
    # FFI_2310 = 2310  (currently disabled)

class VariableType(Enum):
    """Role a variable plays within a dataset."""

    Independent_Variable = 1
    Independent_Bounded_Variable = 2
    Dependent_Variable = 3

def sanitize(val, miss):
    '''Convert a raw value to float, mapping the missing-value flag to NaN.

    :param val: raw value (anything ``float()`` accepts)
    :param miss: missing-value indicator (anything ``float()`` accepts)

    :return: ``float(val)``, or NaN if it equals ``float(miss)``
    :rtype: float
    '''
    number = float(val)
    # np.nan instead of the np.NaN alias, which was removed in NumPy 2.0
    return np.nan if number == float(miss) else number

class DataStore_1001:
    '''Table of records for one independent variable plus dependent variables.

    Records are kept in a NumPy structured array with one ``f8`` field per
    variable; the independent variable is the first field.

    :param ivar: independent variable (``shortname`` and ``miss`` are read)
    :param dvars: mapping of short name to variable (``miss`` is read)
    '''

    def __init__(self, ivar, dvars):
        self.ivarname = ivar.shortname
        # column order: independent variable first, then dvars in mapping order
        self.varnames = [ivar.shortname] + list(dvars)
        self.missvals = {name: dvars[name].miss for name in dvars}
        self.missvals[self.ivarname] = ivar.miss
        # becomes a 1-d structured array once the first record is added
        self.data = None

    def __getitem__(self, s=slice(None)):
        '''Return ``self.data[s]`` (``s`` may be a field name, index or slice).'''
        return self.data[s]

    def add_bulk(self, raw):
        '''Add several records.

        :param raw: sequence of rows, each holding values in ``self.varnames`` order
        '''
        for row in raw:
            self.add(**{name: row[i] for i, name in enumerate(self.varnames)})

    def add(self, **kwargs):
        '''Add one record given as ``shortname=value`` keyword arguments.

        Unknown keywords are ignored; variables not passed stay NaN. Values
        equal to a variable's missing-value flag are stored as NaN.

        :raises Exception: if the independent variable is not given, or a
            record with the same independent value already exists
        '''
        if self.ivarname not in kwargs:
            raise Exception("Need independent variable data.")

        ivarvalue = sanitize(kwargs[self.ivarname], self.missvals[self.ivarname])

        # start from an all-NaN record (np.nan: the np.NaN alias is gone in NumPy 2.0)
        newline = np.array(np.nan, dtype=[(v, 'f8') for v in self.varnames])
        for key, value in kwargs.items():
            if key in self.varnames:
                newline[key] = sanitize(value, self.missvals[key])

        if self.data is None:
            # the fresh record is 0-dimensional; make it 1-d so records can be appended
            self.data = newline.reshape(1)
        elif ivarvalue in self.data[self.ivarname]:
            raise Exception("Cannot replace data (yet).")
        else:
            self.data = np.append(self.data, newline)

    def write(self, f=sys.stdout, fmt="%.1f", delimiter=", "):
        '''Write all records to ``f`` using ``np.savetxt``.

        :param f: handle to write to, defaults to sys.stdout
        :param fmt: number format passed to ``np.savetxt``
        :param delimiter: column separator passed to ``np.savetxt``
        '''
        np.savetxt(f, self.data, fmt=fmt, delimiter=delimiter)


class DataStore_2110:
    '''Nested data store: per independent-variable value one auxiliary table
    ("AUX") and one table of dependent data ("DEP"), both DataStore_1001.

    :param ivar: independent variable (``shortname`` and ``miss`` are read)
    :param ibvar: bounded independent variable (``shortname`` and ``miss`` are read)
    :param auxvars: mapping of short name to auxiliary variable
    :param dvars: mapping of short name to dependent variable
    '''

    def __init__(self, ivar, ibvar, auxvars, dvars):
        self.ivarname = ivar.shortname
        self.ibvarname = ibvar.shortname

        self.auxvarnames = list(auxvars)
        self.dvarnames = list(dvars)

        # later entries win on name clashes: dvars < auxvars < ibvar < ivar
        missing = {name: dvars[name].miss for name in dvars}
        for name in auxvars:
            missing[name] = auxvars[name].miss
        missing[self.ibvarname] = ibvar.miss
        missing[self.ivarname] = ivar.miss
        self.missvals = missing

        # convention: the first auxiliary variable holds the number of DEP lines
        self.nauxvarname = self.auxvarnames[0]

        self.data = {}

        self.ivar = ivar
        self.auxvars = auxvars
        self.ibvar = ibvar
        self.dvars = dvars

    def add_auxline(self, auxline):
        '''Add one auxiliary line: independent value followed by the auxiliary values.'''
        columns = [self.ivarname] + self.auxvarnames
        record = {name: auxline[pos] for pos, name in enumerate(columns)}
        self.add(**record)

    def add_deplines(self, ivar, raw):
        '''Add dependent lines (bounded value followed by dependent values)
        belonging to independent value ``ivar``.'''
        columns = [self.ibvarname] + self.dvarnames
        for row in raw:
            record = {name: row[pos] for pos, name in enumerate(columns)}
            record[self.ivarname] = ivar
            self.add(**record)

    def add_bulk(self, raw):
        '''Add a sequence of lines in which each auxiliary line is followed by
        as many dependent lines as its first auxiliary value states.'''
        pos = 0
        while pos < len(raw):
            header = raw[pos]
            ivarvalue = sanitize(header[0], self.missvals[self.ivarname])

            self.add_auxline(header)
            pos += 1

            # the stored count may come back 0-dimensional or as an array
            counts = self.data[ivarvalue]['AUX'][self.nauxvarname]
            if counts.shape == ():
                nprimary = int(counts)
            else:
                nprimary = int(counts[-1])

            self.add_deplines(ivarvalue, raw[pos:pos + nprimary])
            pos += nprimary

    def add(self, **kwargs):
        '''Add one line given as ``shortname=value`` keyword arguments.

        Lines naming auxiliary variables go to the "AUX" table, lines naming
        dependent variables to the "DEP" table of the same independent value.

        :raises Exception: if the independent variable is missing, if a
            dependent line lacks the bounded variable, or if no auxiliary
            line exists yet for the independent value
        '''
        # whatever we do, an independent variable is needed
        if self.ivarname not in kwargs:
            raise Exception("Need independent variable data.")

        ivarvalue = sanitize(kwargs[self.ivarname], self.missvals[self.ivarname])

        has_aux = any(name in self.auxvarnames for name in kwargs)
        if has_aux:
            # first auxiliary line for this value creates both tables
            if ivarvalue not in self.data:
                self.data[ivarvalue] = {
                    "AUX": DataStore_1001(self.ivar, self.auxvars),
                    "DEP": DataStore_1001(self.ibvar, self.dvars),
                }
            self.data[ivarvalue]['AUX'].add(**kwargs)

        has_dep = any(name in self.dvarnames for name in kwargs)
        if has_dep:
            if self.ibvarname not in kwargs:
                raise Exception("Need independent (bounded) variable data.")

            if ivarvalue not in self.data:
                raise Exception("Aux data line needs to be added first.")

            self.data[ivarvalue]['DEP'].add(**kwargs)

    def write(self, f=sys.stdout, fmt="%.1f", delimiter=", "):
        '''Write, for each independent value, the "AUX" table followed by the "DEP" table.'''
        for tables in self.data.values():
            tables['AUX'].write(f, fmt=fmt, delimiter=delimiter)
            tables['DEP'].write(f, fmt=fmt, delimiter=delimiter)

class KeywordComment():
    '''One "KEYWORD: value" entry of the normal comments; the value may span several lines.

    :param key: keyword name
    :type key: str

    :param na_allowed: flag stored on the instance (not evaluated in this class)
    :type na_allowed: bool
    '''
    def __init__(self, key, na_allowed):
        self.key        = key
        self.na_allowed = na_allowed
        self.data       = []

    def append(self, data):
        '''Append one more line of text to the value.'''
        self.data.append(data)

    def __str__(self):
        # the former test "not self.data is []" compared identity with a fresh
        # list and was always true, so "N/A" could never be produced
        d = "\n".join(self.data) if self.data else "N/A"
        return self.key + ": " + d

Christoph Knote's avatar
Christoph Knote committed

Christoph Knote's avatar
Christoph Knote committed
class StandardNormalComments(collections.UserList):
    '''Normal comments section of an ICARTT header.

    Holds free-form text lines, the "KEYWORD: value" entries and the final
    line of variable short names. ``data`` is a read-only property assembled
    from these parts, which is why ``UserList.__init__`` (it assigns
    ``self.data``) is not called.
    '''

    @property
    def nlines(self):
        '''Number of lines the section occupies when written.

        :return: line count
        :rtype: int
        '''
        # "+ 1" -> shortnames line, and keywords might be multiline...
        # A keyword without any data is still written as one line (see `data`),
        # so it has to count as one line here as well.
        return len(self.freeform) + 1 + sum(max(len(k.data), 1) for k in self.keywords.values())

    @property
    def data(self):
        '''Section content: free-form lines, one string per keyword, short names line.'''
        return self.freeform + [ str(s) for s in self.keywords.values() ] + [ self.shortnames ]

    def ingest(self, raw):
        '''Parse raw normal comment lines.

        :param raw: comment lines in file order; the last entry (short names
            line) is popped, i.e. the list is modified in place
        :type raw: list of str
        '''
        # last line is always shortname
        self.shortnames = raw.pop()

        # per standard: The free-form text section consists of the lines
        # between the beginning of the normal comments section
        # and the first required keyword. [...] The required "KEYWORD: value" pairs block
        # starts with the line that begins with the first required keyword
        # and must include all required "KEYWORD: value" pairs
        # in the order listed in the ICARTT documentation.

        current_keyword = None
        for l in raw:
            possible_keyword = l.split(":")[0].strip()
            # NOTE(review): re.match only anchors at the start, so any text
            # beginning with "R" plus an alphanumeric character is treated as
            # a revision keyword - confirm whether a full match was intended.
            if possible_keyword in self.keywords or re.match("R[a-zA-Z0-9]{1,2}[ ]*", possible_keyword):
                current_keyword = possible_keyword
                if current_keyword not in self.keywords: # for the revisions only...
                    self.keywords[current_keyword] = KeywordComment(current_keyword, False)

            if current_keyword is None:
                # everything before the first keyword is free-form text
                self.freeform.append(l)
            else:
                # strip the "KEYWORD:" prefix; continuation lines are kept as they are
                self.keywords[current_keyword].append( l.replace(l.split(":")[0] + ":", "").strip() )

        for key in self.keywords:
            if self.keywords[key].data == []:
                warnings.warn("Normal comments: required keyword {:s} is missing.".format(key))

    def __init__(self):
        # free-form lines are read by nlines/data/ingest and must exist from the start
        self.freeform   = []
        self.shortnames = []

        required_keywords = (
            "PI_CONTACT_INFO",
            "PLATFORM",
            "LOCATION",
            "ASSOCIATED_DATA",
            "INSTRUMENT_INFO",
            "DATA_INFO",
            "UNCERTAINTY",
            "ULOD_FLAG",
            "ULOD_VALUE",
            "LLOD_FLAG",
            "LLOD_VALUE",
            "DM_CONTACT_INFO",
            "PROJECT_INFO",
            "STIPULATIONS_ON_USE",
            "OTHER_COMMENTS",
            "REVISION"
        )

        self.keywords   = { k: KeywordComment(k, True) for k in required_keywords }

        self.keywords["UNCERTAINTY"].na_allowed = False
        self.keywords["REVISION"].na_allowed    = False

Christoph Knote's avatar
Christoph Knote committed
class Variable(collections.UserList):
    '''An ICARTT variable description with name, units, scale and missing value.

    :param shortname: Short name of the variable
    :type shortname: str

    :param units: Units of the variable
    :type units: str

    :param standardname: Standard name of the variable
    :type standardname: str

    :param longname: Long name of the variable
    :type longname: str

    :param vartype: Variable type (unbounded/bounded independent or dependent)
    :type vartype: enum:`VariableType`, defaults to VariableType.Dependent_Variable

    :param scale: Scaling factor for the variable
    :type scale: float, defaults to 1.0

    :param miss: Missing value for the variable
    :type miss: float, defaults to -99999.0

    :param splitChar: Split character for text representation
    :type splitChar: str, defaults to ","
    '''

    def desc(self, splitChar=","):
        '''Variable description string as it appears in an ICARTT file

        :param splitChar: separator placed between the fields
        :type splitChar: str, defaults to ","

        :return: description string
        :rtype: str
        '''
        descstr = [ str(self.shortname), str(self.units) ]
        # standard and long name are optional and only written if set
        if self.standardname is not None:
            descstr += [ str(self.standardname) ]
        if self.longname is not None:
            descstr += [ str(self.longname) ]
        return splitChar.join(descstr)

    def is_valid_variablename(self, name):
        '''Check a name against the naming rules of ICARTT Standard v2 2.1.1 2)

        :param name: variable short name or standard name
        :type name: str

        :return: True if the name complies (an empty name does not)
        :rtype: bool
        '''
        # Uppercase and lowercase ASCII alphanumeric characters and
        # underscores, the first character must be a letter, and the name can
        # be at most 31 characters in length (1 letter + up to 30 more).
        return bool( re.fullmatch("[a-zA-Z][a-zA-Z0-9_]{0,30}", name) )

    def __init__(self, shortname, units, standardname, longname, vartype=VariableType.Dependent_Variable, scale=1.0, miss=-99999.0, splitChar=","):
        '''Constructor method
        '''
        # initialise the UserList storage so len()/iteration work on a new variable
        super().__init__()

        # non-compliant names are accepted, but reported
        if not self.is_valid_variablename(shortname):
            warnings.warn("Variable short name {:s} does not comply with ICARTT standard v2".format(shortname))

        self.shortname      = shortname
        self.units          = units
        self.standardname   = standardname
        self.longname       = longname
        self.vartype        = vartype
        self.scale          = scale
        self.miss           = miss
        self.splitChar      = splitChar
Christoph.Knote's avatar
sdf
Christoph.Knote committed

class Dataset:
    '''An ICARTT dataset that can be created from scratch or read from a file,
Christoph.Knote's avatar
Christoph.Knote committed
    manipulated, and then written to a file.

    :param f: file path or file handle to use
    :type f: str or file handle or stream object, defaults to None

    :param loadData: load data as well (or only header if False)?
    :type loadData: bool, defaults to "True"

    :param splitChar: splitting character used to separate fields in a line
    :type splitChar: str, defaults to ","
Christoph.Knote's avatar
Christoph.Knote committed
    '''
Christoph.Knote's avatar
sdf
Christoph.Knote committed
    @property
    def nheader(self):
        '''Header line count

        :return: line count, or -1 if the file format is not handled here
        :rtype: int
        '''
        total = -1
        if self.format == Formats.FFI_1001:
            total = 14 + len(self.Dependent_Variables) + len(self.Special_Comments) + self.Normal_Comments.nlines
        elif self.format == Formats.FFI_2110:
            # 2: IVAR + IBVAR
            total = 16 + 2 + len(self.Auxiliary_Variables) + len(self.Dependent_Variables) +\
                len(self.Special_Comments) + self.Normal_Comments.nlines
        return total
Christoph.Knote's avatar
Christoph.Knote committed
    @property
    def varnames(self):
        '''Names of variables (independent and dependent)

        :return: list of variable names
        :rtype: list
Christoph.Knote's avatar
Christoph.Knote committed
        '''
        return [x for x in self.Variables.keys()]
Christoph.Knote's avatar
Christoph.Knote committed
    @property
    def times(self):
        '''Time steps of the data

        :return: list of time steps
        :rtype: list
Christoph.Knote's avatar
Christoph.Knote committed
        '''
        return [ self.Date_Collection + datetime.timedelta(seconds=x) for x in self.Independent_Variable ]

    @property
    def Variables(self):
        '''Variables (independent + dependent + auxiliary)

        :return: dictionary of all variables
        :rtype: dict of Variable(s)
        '''
        vars = {}
        if not self.Independent_Variable is None:
            vars[ self.Independent_Variable.shortname ] = self.Independent_Variable
        if not self.Independent_Bounded_Variable is None:
            vars[ self.Independent_Bounded_Variable.shortname ] = self.Independent_Bounded_Variable
        
        vars = { **vars, **self.Dependent_Variables, **self.Auxiliary_Variables }

        return vars
Christoph.Knote's avatar
Christoph.Knote committed
    def __getitem__(self, name):
        '''Shortcut to enable access to variable data by name

        :return: variable data
        :rtype: list
Christoph.Knote's avatar
Christoph.Knote committed
        '''
        return self.Variables[name]
Christoph Knote's avatar
Christoph Knote committed
    def write_header(self, f=sys.stdout):
        '''Write header

        :param f: handle to write to
        :type f: file handle or StringIO stream, defaults to sys.stdout
        '''
        def prnt(txt):
            f.write(str(txt) + "\n")

        # Number of lines in header, file format index (most files use 1001),
        # optionally followed by the version - comma delimited.
        versInfo = [ self.nheader, self.format.value ]
        if self.version is not None:
            versInfo.append( self.version )
        prnt(self.splitChar.join( [ str(x) for x in versInfo ] ))

        # PI last name, first name/initial.
        prnt(self.PI_name)
        # Organization/affiliation of PI.
        prnt(self.PI_affiliation)
        # Data source description (e.g., instrument name, platform name, model name, etc.).
        prnt(self.Data_Source_Description)
        # Mission name (usually the mission acronym).
        prnt(self.Mission_Name)
        # File volume number, number of file volumes - comma delimited.
        prnt(self.splitChar.join([str(self.File_Volume_Number), str(self.Total_Number_Of_File_Volumes)]))
        # UTC date when data begin, UTC date of data reduction or revision
        # - comma delimited (yyyy, mm, dd, yyyy, mm, dd).
        date_format = self.splitChar.join(["%Y", "%m", "%d"])
        prnt(self.splitChar.join([datetime.datetime.strftime(x, date_format)
                                  for x in [self.Date_Collection, self.Revision_Date]]))
        # Data Interval (time spacing in seconds between consecutive data records).
        prnt(self.splitChar.join( [ str(x) for x in self.Data_Interval_Code ] ) )
        if self.format == Formats.FFI_2110:
            # Description or name of independent (bounded) variable.
            prnt(self.Independent_Bounded_Variable.desc(self.splitChar))
        # Description or name of independent variable.
        prnt(self.Independent_Variable.desc(self.splitChar))

        # Number of dependent variables.
        prnt(len(self.Dependent_Variables))
        # Scale factors - comma delimited.
        prnt(self.splitChar.join(
            ["{:.1g}".format(DVAR.scale) for DVAR in self.Dependent_Variables.values()]))
        # Missing data indicators - comma delimited.
        prnt(self.splitChar.join([str(DVAR.miss)
                                  for DVAR in self.Dependent_Variables.values()]))
        # Variable names and units, one variable per line.
        for DVAR in self.Dependent_Variables.values():
            prnt(DVAR.desc(self.splitChar))

        if self.format == Formats.FFI_2110:
            # Same four entries again, this time for the auxiliary variables.
            prnt(len(self.Auxiliary_Variables))
            prnt(self.splitChar.join(
                ["{:.1g}".format(AUXVAR.scale) for AUXVAR in self.Auxiliary_Variables.values()]))
            prnt(self.splitChar.join([str(AUXVAR.miss)
                                      for AUXVAR in self.Auxiliary_Variables.values()]))
            for AUXVAR in self.Auxiliary_Variables.values():
                prnt(AUXVAR.desc(self.splitChar))

        # Number of SPECIAL comment lines, NOT including this line.
        prnt("{:d}".format(len(self.Special_Comments)))
        # Special comments, one per line.
        for x in self.Special_Comments:
            prnt(x)

        # Number of Normal comment lines, NOT including this line.
        prnt("{:d}".format(self.Normal_Comments.nlines))
        # Normal comments; the last line holds all variable short names.
        # re-create last line out of actual data if missing...
        if self.Normal_Comments.shortnames == []:
            self.Normal_Comments.shortnames = self.splitChar.join( [ self.Variables[x].shortname for x in self.Variables ] )
        for x in self.Normal_Comments:
            prnt(x)
    
Christoph Knote's avatar
Christoph Knote committed
    def _write_data_1001(self, prnt=lambda x: sys.stdout.write(x)):
        def p(val, var):
            return var.miss if math.isnan(val) else val

        for i in range(len(self.Independent_Variable)):
            prnt([p(self.Independent_Variable[i], self.Independent_Variable)] + \
                 [p(DVAR[i][1], DVAR) for DVAR in self.Dependent_Variables.values()])
Christoph Knote's avatar
Christoph Knote committed

    def _write_data_2110(self, prnt=lambda x: sys.stdout.write(x)):
        '''Hand the data to ``prnt``, one list of values per line.

        For every value of the independent variable one line with that value
        and the matching auxiliary values is produced, followed by one line
        per matching value of the bounded independent variable with the
        matching dependent values. NaN values are replaced by the missing
        value of their variable.

        :param prnt: callable receiving one list of values per line
        '''
        def p(val, var):
            return var.miss if math.isnan(val) else val

        for ival in self.Independent_Variable:
            # auxiliary entries are indexed as (independent value, value)
            prnt([p(ival, self.Independent_Variable)] + \
                 [p(auxval[1], AUXVAR) for AUXVAR in self.Auxiliary_Variables.values() for auxval in AUXVAR if auxval[0] == ival])
            # bounded variable entries are indexed as (independent value, bounded value)
            for ibval in [b[1] for b in self.Independent_Bounded_Variable if b[0] == ival]:
                # dependent entries are indexed as ((independent value, bounded value), value)
                prnt([p(ibval, self.Independent_Bounded_Variable)] + \
                     [p(dval[1], DVAR) for DVAR in self.Dependent_Variables.values() for dval in DVAR if (dval[0][0] == ival) and (dval[0][1] == ibval)])
Christoph Knote's avatar
Christoph Knote committed
    def write_data(self, f=sys.stdout):
        '''Write data

        Unknown file formats only trigger a warning; nothing is written then.

        :param f: handle to write to
        :type f: file handle or StringIO stream, defaults to sys.stdout
        '''
        def prnt_data(values):
            # one line per record, values separated by the split character
            f.write(str(self.splitChar.join([str(x) for x in values])) + "\n")

        if self.format == Formats.FFI_1001:
            self._write_data_1001(prnt=prnt_data)
        elif self.format == Formats.FFI_2110:
            self._write_data_2110(prnt=prnt_data)
        else:
            warnings.warn("Unknown file format {:d}".format(self.format))
Christoph Knote's avatar
Christoph Knote committed
    def write(self, f=sys.stdout):
        '''Write the header followed by the data

        :param f: handle to write to
        :type f: file handle or StringIO stream, defaults to sys.stdout
        '''
        self.write_header(f=f)
        # data lines go to the same handle, directly after the header
        self.write_data(f=f)
Christoph Knote's avatar
Christoph Knote committed
    def make_filename(self, date_format='%Y%m%d'):
        '''Create ICARTT-compliant file name based on the information contained in the dataset

        :param date_format: date format to use when parsing
        :type date_format: str, defaults to '%Y%m%d'

        :return: file name generated
        :rtype: string
        fn = self.dataID + "_" + self.locationID + "_" + \
            datetime.datetime.strftime(self.Date_Collection, date_format)
Christoph Knote's avatar
Christoph Knote committed
        fn += "_R" + str(self.revision) if not self.revision is None else ""
        fn += "_L" + str(self.launch) if not self.launch is None else ""
        fn += "_V" + str(self.File_Volume_Number) if self.Total_Number_Of_File_Volumes > 1 else ""
Christoph Knote's avatar
Christoph Knote committed
        return fn + ".ict"
Christoph.Knote's avatar
Christoph.Knote committed

    def is_valid_filename(self, name):
        '''Check a file name against ICARTT standard v2 2.1.1 3)

        Allowed are uppercase and lowercase ASCII alphanumeric characters
        (i.e. A-Z, a-z, 0-9), underscore, period, and hyphen; file names can
        be a maximum 127 characters in length.

        :param name: file name to check
        :type name: str

        :return: True if the name complies
        :rtype: bool
        '''
        allowed = re.compile("[a-zA-Z0-9-_.]")
        only_allowed_characters = all(allowed.match(character) for character in name)
        short_enough = len(name) <= 127

        return only_allowed_characters and short_enough

Christoph.Knote's avatar
Christoph.Knote committed
    def read_header(self):
        '''Read the ICARTT header (from file)

        The input file handle is reopened if it was closed before and is
        always closed again afterwards. Problems while reading the header are
        reported as a warning instead of being dropped silently.
        '''
        class Filehandle_with_linecounter:
            '''File handle wrapper that counts lines read and optionally splits them.'''
            def __init__(self, f, splitChar):
                self.f = f
                self.line = 0
                self.splitChar = splitChar

            def readline(self, do_split=True):
                self.line += 1
                # strip line endings (\n and \r)
                dmp = self.f.readline().replace('\n', '').replace('\r', '')
                if do_split:
                    dmp = [word.strip(' ')
                           for word in dmp.split(self.splitChar)]
                return dmp

        if self.input_fhandle.closed:
            self.input_fhandle = open(self.input_fhandle.name)

        f = Filehandle_with_linecounter(self.input_fhandle, self.splitChar)
        try:
            self._read_header(f)
        except Exception as err:
            # still best effort, but tell the user that (and where) reading stopped
            warnings.warn("Could not read ICARTT header (stopped at line {:d}): {}".format(f.line, err))
        finally:
            self.input_fhandle.close()

    def _read_header(self, f):
Christoph.Knote's avatar
Christoph.Knote committed
        # line 1 - Number of lines in header, file format index (most files use
        # 1001) - comma delimited.
Christoph Knote's avatar
Christoph Knote committed
        dmp = f.readline()
Christoph Knote's avatar
Christoph Knote committed
        nheader_suggested = int(dmp[0])
        try:
            self.format = Formats(int(dmp[1]))
        except:
            raise ValueError(
                "ICARTT format {:d} not implemented".format( dmp[1] ))

Christoph Knote's avatar
Christoph Knote committed
        if len(dmp) > 2:
            self.version = dmp[2]
Christoph.Knote's avatar
Christoph.Knote committed
        # line 2 - PI last name, first name/initial.
        self.PI_name = f.readline(do_split=False)
Christoph.Knote's avatar
Christoph.Knote committed
        # line 3 - Organization/affiliation of PI.
        self.PI_affiliation = f.readline(do_split=False)
Christoph.Knote's avatar
Christoph.Knote committed
        # line 4 - Data source description (e.g., instrument name, platform name,
        # model name, etc.).
        self.Data_Source_Description = f.readline(do_split=False)
Christoph.Knote's avatar
Christoph.Knote committed
        # line 5 - Mission name (usually the mission acronym).
        self.Mission_Name = f.readline(do_split=False)
Christoph.Knote's avatar
Christoph.Knote committed

        # line 6 - File volume number, number of file volumes (these integer values
        # are used when the data require more than one file per day; for data that
        # require only one file these values are set to 1, 1) - comma delimited.
Christoph Knote's avatar
Christoph Knote committed
        dmp = f.readline()
        self.File_Volume_Number = int(dmp[0])
        self.Total_Number_Of_File_Volumes = int(dmp[1])
Christoph.Knote's avatar
Christoph.Knote committed
        # line 7 - UTC date when data begin, UTC date of data reduction or revision
        # - comma delimited (yyyy, mm, dd, yyyy, mm, dd).
Christoph Knote's avatar
Christoph Knote committed
        dmp = f.readline()
        self.Date_Collection = datetime.datetime.strptime(
            "".join(["{:s}".format(x) for x in dmp[0:3]]), '%Y%m%d')
        self.Revision_Date = datetime.datetime.strptime(
            "".join(["{:s}".format(x) for x in dmp[3:6]]), '%Y%m%d')
        # line 8 - Data Interval (This value describes the time spacing (in seconds)
        # between consecutive data records. It is the (constant) interval between
        # values of the independent variable. For 1 Hz data the data interval value
        # is 1 and for 10 Hz data the value is 0.1. All intervals longer than 1
        # second must be reported as Start and Stop times, and the Data Interval
        # value is set to 0. The Mid-point time is required when it is not at the
        # average of Start and Stop times. For additional information see Section
        # 2.5 below.).
        dmp = f.readline()
        # might have multiple entries for 2110
        self.Data_Interval_Code = [ float(x) for x in dmp ]
        # line 9 - Description or name of independent variable (This is the name
        # chosen for the start time. It always refers to the number of seconds UTC
        # from the start of the day on which measurements began. It should be noted
        # here that the independent variable should monotonically increase even when
        # crossing over to a second day.).

        def extract_vardesc(dmp):
            '''Unpack the entries of one variable description line.

            The first two entries (short name and units) are mandatory. The
            third (standard name) and fourth (long name) are optional and are
            returned as None when absent. Any further entries are ignored.

            :param dmp: sequence with the entries of the description line
            :return: tuple (shortname, units, standardname, longname)
            :raises IndexError: if fewer than two entries are given
            '''
            shortname, units = dmp[0], dmp[1]
            # pad with None so the two optional entries can always be unpacked
            standardname, longname = (list(dmp[2:4]) + [None, None])[:2]
            return shortname, units, standardname, longname

        if self.format == Formats.FFI_2110:
Christoph Knote's avatar
Christoph Knote committed
            dmp = f.readline()
Christoph Knote's avatar
Christoph Knote committed
            shortname, units, standardname, longname = extract_vardesc(dmp)
            self.Independent_Bounded_Variable = Variable(shortname, units, standardname, longname, 
Christoph Knote's avatar
Christoph Knote committed
                                splitChar=self.splitChar)
Christoph Knote's avatar
Christoph Knote committed
        dmp = f.readline()
Christoph Knote's avatar
Christoph Knote committed
        shortname, units, standardname, longname = extract_vardesc(dmp)
        self.Independent_Variable = Variable(shortname, units, standardname, longname, 
Christoph Knote's avatar
Christoph Knote committed
                            splitChar=self.splitChar)
Christoph Knote's avatar
Christoph Knote committed
        def read_vars(f):
            '''Read one block of variable definitions from the header.

            As used here, ``f.readline()`` is expected to return the next
            header line already split into its entries.

            :param f: header reader positioned at the "number of variables" line
            :return: dict mapping each variable short name to its Variable

            Scale factors, missing values and descriptions are combined with
            zip(), so if one of them has fewer entries than the others the
            surplus entries are silently dropped.
            '''
            # line 10 - Number of variables (Integer value showing the number of
            # dependent variables: the total number of columns of data is this value
            # plus one.).
            nvar = int(f.readline()[0])

            # line 11 - Scale factors (1 for most cases, except where grossly
            # inconvenient) - comma delimited.
            vscale = [float(x) for x in f.readline()]

            # line 12 - Missing data indicators (This is -9999 (or -99999, etc.) for
            # any missing data condition, except for the main time (independent)
            # variable which is never missing) - comma delimited.
            # The indicators are stored as floats.
            vmiss = [float(x) for x in f.readline()]

            # line 13 - Variable names and units (Short variable name and units are
            # required, and optional long descriptive name, in that order, and separated
            # by commas. If the variable is unitless, enter the keyword "none" for its
            # units. Each short variable name and units (and optional long name) are
            # entered on one line. The short variable name must correspond exactly to
            # the name used for that variable as a column header, i.e., the last header
            # line prior to start of data.).
            # NOTE: one description line is always consumed, even if nvar < 1.
            descriptions = [extract_vardesc(f.readline())]
            descriptions += [extract_vardesc(f.readline()) for _ in range(1, nvar)]

            return {
                shortname: Variable(shortname, unit, standardname, longname,
                                    scale=scale, miss=miss, splitChar=self.splitChar)
                for (shortname, unit, standardname, longname), scale, miss
                in zip(descriptions, vscale, vmiss)
            }
Christoph Knote's avatar
Christoph Knote committed

        self.Dependent_Variables = read_vars(f)
Christoph Knote's avatar
Christoph Knote committed

        if self.format == Formats.FFI_2110:
            self.Auxiliary_Variables = read_vars(f)
        # line 14 + nvar - Number of SPECIAL comment lines (Integer value
        # indicating the number of lines of special comments, NOT including this
        # line.).
        nscom = int(f.readline()[0])
        # line 15 + nvar - Special comments (Notes of problems or special
        # circumstances unique to this file. An example would be comments/problems
        # associated with a particular flight.).
        self.Special_Comments = [f.readline(do_split=False) for i in range(0, nscom)]
        # line 16 + nvar + nscom - Number of Normal comments (i.e., number of
        # additional lines of SUPPORTING information: Integer value indicating the
        # number of lines of additional information, NOT including this line.).
        nncom = int(f.readline()[0])
        # line 17 + nvar + nscom - Normal comments (SUPPORTING information: This is
        # the place for investigators to more completely describe the data and
        # measurement parameters. The supporting information structure is described
        # below as a list of key word: value pairs. Specifically include here
        # information on the platform used, the geo-location of data, measurement
        # technique, and data revision comments. Note the non-optional information
        # regarding uncertainty, the upper limit of detection (ULOD) and the lower
        # limit of detection (LLOD) for each measured variable. The ULOD and LLOD
        # are the values, in the same units as the measurements that correspond to
        # the flags -7777's and -8888's within the data, respectively. The last line
        # of this section should contain all the "short" variable names on one line.
        # The key words in this section are written in BOLD below and must appear in
        # this section of the header along with the relevant data listed after the
        # colon. For key words where information is not needed or applicable, simply
        # enter N/A.).
        raw_ncom = [f.readline(do_split=False) for i in range(0, nncom)]
        self.Normal_Comments.ingest(raw_ncom)
Christoph Knote's avatar
Christoph Knote committed
        self.nheader_file = f.line
Christoph Knote's avatar
Christoph Knote committed
        if self.nheader != nheader_suggested:
            warnings.warn("Number of header lines suggested in line 1 ({:d}) do not match actual header lines read ({:d})".format(
                nheader_suggested, self.nheader))
Christoph.Knote's avatar
Christoph.Knote committed

    def read_data(self):
        '''Read ICARTT data (from file)

        Skips the first ``self.nheader_file`` lines of ``self.input_fhandle``,
        splits every remaining line at ``self.splitChar`` and hands the
        resulting rows to ``self.data.add_bulk``. Blank lines (e.g. a trailing
        empty line at the end of the file) are ignored. The file handle is
        always closed afterwards, also if reading fails.
        '''
        # this method closes the handle when it is done, so a handle that is
        # already closed is reopened by name
        if self.input_fhandle.closed:
            self.input_fhandle = open(self.input_fhandle.name)

        try:
            # skip the header lines
            for _ in range(self.nheader_file):
                self.input_fhandle.readline()

            # a blank line would otherwise reach the data store as a row
            # holding a single empty entry
            raw = [line.split(self.splitChar)
                   for line in self.input_fhandle if line.strip()]

            self.data.add_bulk(raw)
        finally:
            self.input_fhandle.close()
Christoph.Knote's avatar
Christoph.Knote committed

    def read(self):
        '''Read ICARTT data and header

        Reads the header, ends define mode and then reads the data, in
        that order.
        '''
        self.read_header()
        self.end_define_mode()
        self.read_data()
Christoph Knote's avatar
Christoph Knote committed
    def __del__(self):
        '''Close the input file handle, if there is an open one.

        This is best-effort cleanup: errors raised while closing are
        suppressed so that object destruction never fails.
        '''
        try:
            # the constructor only assigns input_fhandle when it is given a file
            fhandle = getattr(self, 'input_fhandle', None)
            if fhandle is not None and not fhandle.closed:
                fhandle.close()
        except Exception:
            # ignore cleanup failures, but let KeyboardInterrupt and
            # SystemExit through
            pass
    
    def end_define_mode(self):
        '''Leave define mode and create the data store matching the format.

        For FFI 1001 the store is built from the independent and the dependent
        variables, for FFI 2110 additionally from the independent bounded and
        the auxiliary variables. For any other format ``self.data`` is left
        untouched.
        '''
        self.DEFINE_MODE = False

        # create data store
        fmt = self.format
        if fmt == Formats.FFI_2110:
            self.data = DataStore_2110(
                self.Independent_Variable,
                self.Independent_Bounded_Variable,
                self.Auxiliary_Variables,
                self.Dependent_Variables,
            )
        elif fmt == Formats.FFI_1001:
            self.data = DataStore_1001(
                self.Independent_Variable,
                self.Dependent_Variables,
            )

    def __init__(self, f=None, loadData=True, splitChar=",", format=Formats.FFI_1001):
        '''Constructor method

        Initialises all metadata with placeholder values. If an input file is
        given, its header is read and, if requested, its data as well.

        :param f: path of the file to read (opened in text mode) or an already
            opened file object; None creates an empty dataset
        :param loadData: if True, also read the data after reading the header
        :param splitChar: character separating the entries of a line
        :param format: file format indicator (see Formats)
        '''
        self.format                         = format
        self.version                        = None

        self.dataID                         = 'dataID'
        self.locationID                     = 'locationID'

        self.revision                       = 0
        self.launch                         = None
        self.File_Volume_Number             = 1
        self.Total_Number_Of_File_Volumes   = 1

        self.PI_name                        = 'Mustermann, Martin'
        self.PI_affiliation                 = 'Musterinstitut'
        self.Data_Source_Description        = 'Musterdatenprodukt'
        self.Mission_Name                   = 'MUSTEREX'
        self.Date_Collection                = datetime.datetime.today()
        self.Revision_Date                  = datetime.datetime.today()
        self.Data_Interval_Code             = [ 0.0 ]
        self.Independent_Variable           = None
        self.Independent_Bounded_Variable   = None
        self.Auxiliary_Variables            = {}
        self.Dependent_Variables            = {}

        self.Special_Comments               = []
        # Standard v2.0 for normal comments requires all keywords present,
        # might not be the case - then reading data will fail
        self.Normal_Comments                = StandardNormalComments()

        # number of header lines found in the file, -1 until a header was read
        self.nheader_file                   = -1

        self.splitChar                      = splitChar

        self.data                           = None

        self.DEFINE_MODE                    = True

        # read data if f is not None
        if f is not None:
            if isinstance(f, str):
                self.input_fhandle = open(f, 'r')
            else:
                self.input_fhandle = f

            self.read_header()
            if loadData:
                self.end_define_mode()
                self.read_data()