From 0ba20a71be19b94ae10e3d480904274040de4af3 Mon Sep 17 00:00:00 2001 From: Christoph Knote Date: Wed, 16 Feb 2022 17:48:14 +0100 Subject: [PATCH] WIP: overhaul of data model --- docs/source/index.rst | 76 ++++++- icartt/dataset.py | 451 ++++++++++++++++++++---------------------- 2 files changed, 282 insertions(+), 245 deletions(-) diff --git a/docs/source/index.rst b/docs/source/index.rst index 2f932da..1bc707f 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -69,14 +69,82 @@ Creating a new dataset ^^^^^^^^^^^^^^^^^^^^^^^ :: - - imoprt icartt - ict = icartt.Dataset(format=icartt.Formats.FFI_1001) +import icartt +import datetime - ict.IVAR +ict = icartt.Dataset(format=icartt.Formats.FFI1001) +ict.PIName = 'Knote, Christoph' +ict.PIAffiliation = 'Faculty of Medicine, University Augsburg, Germany' +ict.dataSourceDescription = 'Example data' +ict.missionName = 'MBEES' +ict.dateOfCollection = datetime.datetime.today() +ict.dateOfRevision = datetime.datetime.today() +ict.dataIntervalCode = [ 0 ] + +ict.independentVariable = icartt.Variable( 'Time_Start', + 'seconds_from_0_hours_on_valid_date', + 'Time_Start', + 'Time_Start', + vartype=icartt.VariableType.IndependentVariable, + scale=1.0, miss=-9999999) + +#ict.independentBoundedVariable = None +#ict.auxiliaryVariables = ... + +ict.dependentVariables['Time_Stop'] = icartt.Variable( 'Time_Stop', + 'seconds_from_0_hours_on_valid_date', + 'Time_Stop', + 'Time_Stop', + scale=1.0, miss=-9999999) + +ict.dependentVariables['Payload'] = icartt.Variable( 'Payload', + 'some_units', + 'Payload', + 'Payload', + scale=1.0, miss=-9999999) + +ict.specialComments.append("Some comments on this dataset:") +ict.specialComments.append("They are just examples!") +ict.specialComments.append("Adapt as needed.") + + +ict.endDefineMode() + + +# Three ways to add data: + +# 1) simple (single data line) + +ict.data.add( Time_Start = 12.3, Time_Stop = 12.5, Payload = 23789423.2e5 ) + +# Let's check: + +ict.write() + +# Seems to have worked! + +# 2) as dictionary (single data line) + +ict.data.add( **{ 'Time_Start': 12.6, 'Time_Stop': 13.1, 'Payload': 324235644.1e5 } ) + +# (note, we are merely exploding the dictionary to resemble method 1) + +# 3) as NumPy array (bulk) + +import numpy as np + +data = np.array( [ (13.4, 14.0, 2348925e5), (14.1, 14.9, 23425634e5) ] ) +ict.data.addBulk( data ) + +# Note: you are responsible to ensure that the order of elements in a data line +# corresponds to variable listing below: +print( [ x for x in ict.variables ] ) + +# Note: for single lines, you still need to make it an array! +data = np.array( [ (15.4, 15.0, 52452495290e5) ] ) API ---- diff --git a/icartt/dataset.py b/icartt/dataset.py index c881b18..32f107c 100644 --- a/icartt/dataset.py +++ b/icartt/dataset.py @@ -2,29 +2,28 @@ import datetime import sys import collections import re -import math import warnings import numpy as np -from io import StringIO from enum import Enum, IntEnum class Formats(IntEnum): '''File Format Indicators (FFI) ''' - FFI_1001 = 1001 - FFI_2110 = 2110 -# FFI_2310 = 2310 + FFI1001 = 1001 + FFI2110 = 2110 +# FFI2310 = 2310 class VariableType(Enum): - Independent_Variable = 1 - Independent_Bounded_Variable = 2 - Dependent_Variable = 3 + IndependentVariable = 1 + IndependentBoundedVariable = 2 + AuxiliaryVariable = 3 + DependentVariable = 4 def sanitize(val, miss): return float(val) if not float(val) == float(miss) else np.NaN -class DataStore_1001: +class DataStore1001: def __init__(self, ivar, dvars): self.ivarname = ivar.shortname # @@ -37,7 +36,13 @@ class DataStore_1001: def __getitem__(self, s=slice(None)): return self.data[s] # - def add_bulk(self, raw): + def addBulk(self, raw): + nlines, nvars = raw.shape + for cur in range(nlines): + newdata = { x: raw[cur][i] for i, x in enumerate( self.varnames ) } + self.add(**newdata) +# + def addBulkFromTxt(self, raw): for cur in range(len(raw)): newdata = { x: raw[cur][i] for i, x in enumerate( self.varnames ) } self.add(**newdata) @@ -62,11 +67,18 @@ class DataStore_1001: else: self.data = np.append( self.data, newline ) + def denanify(self, d): + dd = d.copy() + for k, miss in self.missvals.items(): + dd[k][ np.isnan(dd[k]) ] = miss + return dd + def write(self, f=sys.stdout, fmt="%.1f", delimiter=", "): - np.savetxt( f, self.data, fmt=fmt, delimiter=delimiter ) + d = self.denanify(self.data) + np.savetxt( f, d, fmt=fmt, delimiter=delimiter ) -class DataStore_2110: +class DataStore2110: def __init__(self, ivar, ibvar, auxvars, dvars): self.ivarname = ivar.shortname self.ibvarname = ibvar.shortname @@ -88,29 +100,29 @@ class DataStore_2110: self.ibvar = ibvar self.dvars = dvars # - def add_auxline(self, auxline): + def addAuxline(self, auxline): newdata = { x: auxline[i] for i, x in enumerate( [ self.ivarname ] + self.auxvarnames ) } self.add(**newdata) - def add_deplines(self, ivar, raw): + def addDeplines(self, ivar, raw): for cur in range(len(raw)): newdata = { x: raw[cur][i] for i, x in enumerate( [ self.ibvarname ] + self.dvarnames ) } newdata.update( { self.ivarname: ivar }) self.add(**newdata) - def add_bulk(self, raw): + def addBulkFromTxt(self, raw): cur = 0 while cur < len(raw): ivarvalue = sanitize(raw[cur][0], self.missvals[self.ivarname]) - self.add_auxline(raw[cur]) + self.addAuxline(raw[cur]) cur += 1 - # stupid, but at first auxline added, nprimary_dat ist a 0-dim array... - nprimary_data = self.data[ivarvalue]['AUX'][self.nauxvarname] - nprimary = int(nprimary_data) if nprimary_data.shape == () else int(nprimary_data[-1]) + # stupid, but at first auxline added, nprimaryData ist a 0-dim array... + nprimaryData = self.data[ivarvalue]['AUX'][self.nauxvarname] + nprimary = int(nprimaryData) if nprimaryData.shape == () else int(nprimaryData[-1]) - self.add_deplines(ivarvalue, raw[ cur:(cur+nprimary) ]) + self.addDeplines(ivarvalue, raw[ cur:(cur+nprimary) ]) cur += nprimary @@ -126,8 +138,8 @@ class DataStore_2110: # and we create the whole dataset if needed if not ivarvalue in self.data.keys(): self.data[ ivarvalue ] = { - "AUX": DataStore_1001( self.ivar, self.auxvars ), - "DEP": DataStore_1001( self.ibvar, self.dvars ) + "AUX": DataStore1001( self.ivar, self.auxvars ), + "DEP": DataStore1001( self.ibvar, self.dvars ) } self.data[ ivarvalue ]['AUX'].add( **kwargs ) @@ -147,9 +159,9 @@ class DataStore_2110: self.data[ ivarvalue ]['DEP'].write(f, fmt=fmt, delimiter=delimiter) class KeywordComment(): - def __init__(self, key, na_allowed): + def __init__(self, key, naAllowed): self.key = key - self.na_allowed = na_allowed + self.naAllowed = naAllowed self.data = [] def append(self, data): self.data.append(data) @@ -179,18 +191,18 @@ class StandardNormalComments(collections.UserList): # and must include all required “KEYWORD: value” pairs # in the order listed in the ICARTT documentation. - current_keyword = None + currentKeyword = None for l in raw: - possible_keyword = l.split(":")[0].strip() - if possible_keyword in self.keywords or re.match("R[a-zA-Z0-9]{1,2}[ ]*", possible_keyword): - current_keyword = possible_keyword - if not current_keyword in self.keywords: # for the revisions only... - self.keywords[current_keyword] = KeywordComment(current_keyword, False) + possibleKeyword = l.split(":")[0].strip() + if possibleKeyword in self.keywords or re.match("R[a-zA-Z0-9]{1,2}[ ]*", possibleKeyword): + currentKeyword = possibleKeyword + if not currentKeyword in self.keywords: # for the revisions only... + self.keywords[currentKeyword] = KeywordComment(currentKeyword, False) - if current_keyword is None: + if currentKeyword is None: self.freeform.append(l) else: - self.keywords[current_keyword].append( l.replace(l.split(":")[0] + ":", "").strip() ) + self.keywords[currentKeyword].append( l.replace(l.split(":")[0] + ":", "").strip() ) for key in self.keywords: if self.keywords[key].data == []: @@ -200,7 +212,7 @@ class StandardNormalComments(collections.UserList): self.freeform = [] self.shortnames = [] - required_keywords = ( + requiredKeywords = ( "PI_CONTACT_INFO", "PLATFORM", "LOCATION", @@ -219,12 +231,12 @@ class StandardNormalComments(collections.UserList): "REVISION" ) - self.keywords = { k: KeywordComment(k, True) for k in required_keywords } + self.keywords = { k: KeywordComment(k, True) for k in requiredKeywords } - self.keywords["UNCERTAINTY"].na_allowed = False - self.keywords["REVISION"].na_allowed = False + self.keywords["UNCERTAINTY"].naAllowed = False + self.keywords["REVISION"].naAllowed = False -class Variable(collections.UserList): +class Variable: '''An ICARTT variable description with name, units, scale and missing value. :param shortname: Short name of the variable @@ -240,16 +252,13 @@ class Variable(collections.UserList): :type longname: str :param vartype: Variable type (unbounded/bounded independent or dependent) - :type vartype: enum:`icartt.Formats`, defaults to VariableType.Dependent_Variable + :type vartype: enum:`icartt.Formats`, defaults to VariableType.dependentVariable :param scale: Scaling factor for the variable :type scale: float, defaults to 1.0 :param miss: Missing value for the variable :type miss: float, defaults to -99999.0 - - :param splitChar: Split character for text representation - :type splitChar: str, defaults to "," ''' def desc(self, splitChar=","): @@ -265,25 +274,25 @@ class Variable(collections.UserList): descstr += [ str(self.longname) ] return splitChar.join(descstr) - def is_valid_variablename(self, name): + def isValidVariablename(self, name): # ICARTT Standard v2 2.1.1 2) # Variable short names and variable standard names: # Uppercase and lowercase ASCII alphanumeric characters # and underscores. - def is_ascii_alpha_or_underscore(x): + def isAsciiAlphaOrUnderscore(x): return re.match("[a-zA-Z0-9_]", x) - all_are_alpha_or_underscore = all( [ is_ascii_alpha_or_underscore(x) for x in name ] ) + allAreAlphaOrUnderscore = all( [ isAsciiAlphaOrUnderscore(x) for x in name ] ) # The first character must be a letter, - first_is_alpha = bool( re.match("[a-zA-Z]", name[0]) ) + firstIsAlpha = bool( re.match("[a-zA-Z]", name[0]) ) # and the name can be at most 31 characters in length. - less_than_31_chars = len(name) <= 31 + lessThan31Chars = len(name) <= 31 - return (all_are_alpha_or_underscore and first_is_alpha and less_than_31_chars) + return (allAreAlphaOrUnderscore and firstIsAlpha and lessThan31Chars) - def __init__(self, shortname, units, standardname, longname, vartype=VariableType.Dependent_Variable, scale=1.0, miss=-99999.0, splitChar=","): + def __init__(self, shortname, units, standardname, longname, vartype=VariableType.DependentVariable, scale=1.0, miss=-99999.0): '''Constructor method ''' - if not self.is_valid_variablename(shortname): + if not self.isValidVariablename(shortname): warnings.warn("Variable short name {:s} does not comply with ICARTT standard v2".format(shortname)) self.shortname = shortname @@ -293,8 +302,9 @@ class Variable(collections.UserList): self.vartype = vartype self.scale = scale self.miss = miss - - self.splitChar = splitChar + + def __repr__(self): + return "ICARTT Variable description" class Dataset: @@ -320,23 +330,14 @@ class Dataset: :rtype: int ''' total = -1 - if self.format == Formats.FFI_1001: - total = 14 + len(self.Dependent_Variables) + len(self.Special_Comments) + self.Normal_Comments.nlines - if self.format == Formats.FFI_2110: + if self.format == Formats.FFI1001: + total = 14 + len(self.dependentVariables) + len(self.specialComments) + self.normalComments.nlines + if self.format == Formats.FFI2110: # 2: IVAR + IBVAR - total = 16 + 2 + len(self.Auxiliary_Variables) + len(self.Dependent_Variables) +\ - len(self.Special_Comments) + self.Normal_Comments.nlines + total = 16 + 2 + len(self.auxiliaryVariables) + len(self.dependentVariables) +\ + len(self.specialComments) + self.normalComments.nlines return total - @property - def varnames(self): - '''Names of variables (independent and dependent) - - :return: list of variable names - :rtype: list - ''' - return [x for x in self.Variables.keys()] - @property def times(self): '''Time steps of the data @@ -344,34 +345,26 @@ class Dataset: :return: list of time steps :rtype: list ''' - return [ self.Date_Collection + datetime.timedelta(seconds=x) for x in self.Independent_Variable ] + return [ self.dateOfCollection + datetime.timedelta(seconds=x) for x in self.independentVariable ] @property - def Variables(self): + def variables(self): '''Variables (independent + dependent + auxiliary) :return: dictionary of all variables :rtype: dict of Variable(s) ''' vars = {} - if not self.Independent_Variable is None: - vars[ self.Independent_Variable.shortname ] = self.Independent_Variable - if not self.Independent_Bounded_Variable is None: - vars[ self.Independent_Bounded_Variable.shortname ] = self.Independent_Bounded_Variable + if not self.independentVariable is None: + vars[ self.independentVariable.shortname ] = self.independentVariable + if not self.independentBoundedVariable is None: + vars[ self.independentBoundedVariable.shortname ] = self.independentBoundedVariable - vars = { **vars, **self.Dependent_Variables, **self.Auxiliary_Variables } + vars = { **vars, **self.dependentVariables, **self.auxiliaryVariables } return vars - def __getitem__(self, name): - '''Shortcut to enable access to variable data by name - - :return: variable data - :rtype: list - ''' - return self.Variables[name] - - def write_header(self, f=sys.stdout): + def writeHeader(self, f=sys.stdout, delimiter=", "): '''Write header :param f: handle to write to @@ -384,174 +377,147 @@ class Dataset: versInfo = [ self.nheader, self.format.value ] if self.version is not None: versInfo.append( self.version ) - txt = self.splitChar.join( [ str(x) for x in versInfo ] ) + txt = delimiter.join( [ str(x) for x in versInfo ] ) prnt(txt) # PI last name, first name/initial. - prnt(self.PI_name) + prnt(self.PIName) # Organization/affiliation of PI. - prnt(self.PI_affiliation) + prnt(self.PIAffiliation) # Data source description (e.g., instrument name, platform name, model name, etc.). - prnt(self.Data_Source_Description) + prnt(self.dataSourceDescription) # Mission name (usually the mission acronym). - prnt(self.Mission_Name) + prnt(self.missionName) # File volume number, number of file volumes (these integer values are used when the data require more than one file per day; for data that require only one file these values are set to 1, 1) - comma delimited. - prnt(self.splitChar.join([str(self.File_Volume_Number), str(self.Total_Number_Of_File_Volumes)])) + prnt(delimiter.join([str(self.fileVolumeNumber), str(self.totalNumberOfFileVolumes)])) # UTC date when data begin, UTC date of data reduction or revision - comma delimited (yyyy, mm, dd, yyyy, mm, dd). - prnt(self.splitChar.join([datetime.datetime.strftime(x, self.splitChar.join( - ["%Y", "%m", "%d"])) for x in [self.Date_Collection, self.Revision_Date]])) + prnt(delimiter.join([datetime.datetime.strftime(x, delimiter.join( + ["%Y", "%m", "%d"])) for x in [self.dateOfCollection, self.dateOfRevision]])) # Data Interval (This value describes the time spacing (in seconds) between consecutive data records. It is the (constant) interval between values of the independent variable. For 1 Hz data the data interval value is 1 and for 10 Hz data the value is 0.1. All intervals longer than 1 second must be reported as Start and Stop times, and the Data Interval value is set to 0. The Mid-point time is required when it is not at the average of Start and Stop times. For additional information see Section 2.5 below.). - prnt(self.splitChar.join( [ str(x) for x in self.Data_Interval_Code ] ) ) - if self.format == Formats.FFI_2110: + prnt(delimiter.join( [ str(x) for x in self.dataIntervalCode ] ) ) + if self.format == Formats.FFI2110: # Description or name of independent (bound) variable (This is the name chosen for the start time. It always refers to the number of seconds UTC from the start of the day on which measurements began. It should be noted here that the independent variable should monotonically increase even when crossing over to a second day.). - prnt(self.Independent_Bounded_Variable.desc(self.splitChar)) + prnt(self.independentBoundedVariable.desc(delimiter)) # Description or name of independent variable (This is the name chosen for the start time. It always refers to the number of seconds UTC from the start of the day on which measurements began. It should be noted here that the independent variable should monotonically increase even when crossing over to a second day.). - prnt(self.Independent_Variable.desc(self.splitChar)) + prnt(self.independentVariable.desc(delimiter)) # Number of variables (Integer value showing the number of dependent variables: the total number of columns of data is this value plus one.). - prnt(len(self.Dependent_Variables)) + prnt(len(self.dependentVariables)) # Scale factors (1 for most cases, except where grossly inconvenient) - comma delimited. - prnt(self.splitChar.join( - ["{:.1g}".format(DVAR.scale) for DVAR in self.Dependent_Variables.values()])) + prnt(delimiter.join( + ["{:.1g}".format(DVAR.scale) for DVAR in self.dependentVariables.values()])) # Missing data indicators (This is -9999 (or -99999, etc.) for any missing data condition, except for the main time (independent) variable which is never missing) - comma delimited. - prnt(self.splitChar.join([str(DVAR.miss) - for DVAR in self.Dependent_Variables.values()])) + prnt(delimiter.join([str(DVAR.miss) + for DVAR in self.dependentVariables.values()])) # Variable names and units (Short variable name and units are required, and optional long descriptive name, in that order, and separated by commas. If the variable is unitless, enter the keyword "none" for its units. Each short variable name and units (and optional long name) are entered on one line. The short variable name must correspond exactly to the name used for that variable as a column header, i.e., the last header line prior to start of data.). - nul = [prnt(DVAR.desc(self.splitChar)) for DVAR in self.Dependent_Variables.values()] - if self.format == Formats.FFI_2110: + nul = [prnt(DVAR.desc(delimiter)) for DVAR in self.dependentVariables.values()] + if self.format == Formats.FFI2110: # Number of variables (Integer value showing the number of dependent variables: the total number of columns of data is this value plus one.). - prnt(len(self.Auxiliary_Variables)) + prnt(len(self.auxiliaryVariables)) # Scale factors (1 for most cases, except where grossly inconvenient) - comma delimited. - prnt(self.splitChar.join( - ["{:.1g}".format(AUXVAR.scale) for AUXVAR in self.Auxiliary_Variables.values()])) + prnt(delimiter.join( + ["{:.1g}".format(AUXVAR.scale) for AUXVAR in self.auxiliaryVariables.values()])) # Missing data indicators (This is -9999 (or -99999, etc.) for any missing data condition, except for the main time (independent) variable which is never missing) - comma delimited. - prnt(self.splitChar.join([str(AUXVAR.miss) - for AUXVAR in self.Auxiliary_Variables.values()])) + prnt(delimiter.join([str(AUXVAR.miss) + for AUXVAR in self.auxiliaryVariables.values()])) # Variable names and units (Short variable name and units are required, and optional long descriptive name, in that order, and separated by commas. If the variable is unitless, enter the keyword "none" for its units. Each short variable name and units (and optional long name) are entered on one line. The short variable name must correspond exactly to the name used for that variable as a column header, i.e., the last header line prior to start of data.). - nul = [prnt(AUXVAR.desc(self.splitChar)) for AUXVAR in self.Auxiliary_Variables.values()] + nul = [prnt(AUXVAR.desc(delimiter)) for AUXVAR in self.auxiliaryVariables.values()] # Number of SPECIAL comment lines (Integer value indicating the number of lines of special comments, NOT including this line.). - prnt("{:d}".format(len(self.Special_Comments))) + prnt("{:d}".format(len(self.specialComments))) # Special comments (Notes of problems or special circumstances unique to this file. An example would be comments/problems associated with a particular flight.). - nul = [prnt(x) for x in self.Special_Comments] + nul = [prnt(x) for x in self.specialComments] # Number of Normal comments (i.e., number of additional lines of SUPPORTING information: Integer value indicating the number of lines of additional information, NOT including this line.). - prnt("{:d}".format(self.Normal_Comments.nlines)) + prnt("{:d}".format(self.normalComments.nlines)) # Normal comments (SUPPORTING information: This is the place for investigators to more completely describe the data and measurement parameters. The supporting information structure is described below as a list of key word: value pairs. Specifically include here information on the platform used, the geo-location of data, measurement technique, and data revision comments. Note the non-optional information regarding uncertainty, the upper limit of detection (ULOD) and the lower limit of detection (LLOD) for each measured variable. The ULOD and LLOD are the values, in the same units as the measurements that correspond to the flags -7777s and -8888s within the data, respectively. The last line of this section should contain all the short variable names on one line. The key words in this section are written in BOLD below and must appear in this section of the header along with the relevant data listed after the colon. For key words where information is not needed or applicable, simply enter N/A.). # re-create last line out of actual data if missing... - if self.Normal_Comments.shortnames == []: - self.Normal_Comments.shortnames = self.splitChar.join( [ self.Variables[x].shortname for x in self.Variables ] ) - nul = [prnt(x) for x in self.Normal_Comments] + if self.normalComments.shortnames == []: + self.normalComments.shortnames = delimiter.join( [ self.variables[x].shortname for x in self.variables ] ) + nul = [prnt(x) for x in self.normalComments] - def _write_data_1001(self, prnt=lambda x: sys.stdout.write(x)): - def p(val, var): - return var.miss if math.isnan(val) else val - - for i in range(len(self.Independent_Variable)): - prnt([p(self.Independent_Variable[i], self.Independent_Variable)] + \ - [p(DVAR[i][1], DVAR) for DVAR in self.Dependent_Variables.values()]) - - def _write_data_2110(self, prnt=lambda x: sys.stdout.write(x)): - def p(val, var): - return var.miss if math.isnan(val) else val - - for ival in self.Independent_Variable: - prnt([p(ival, self.Independent_Variable)] + \ - [p(auxval[1], AUXVAR) for AUXVAR in self.Auxiliary_Variables.values() for auxval in AUXVAR if auxval[0] == ival]) - for ibval in [b[1] for b in self.Independent_Bounded_Variable if b[0] == ival]: - prnt([p(ibval, self.Independent_Bounded_Variable)] + \ - [p(dval[1], DVAR) for DVAR in self.Dependent_Variables.values() for dval in DVAR if (dval[0][0] == ival) and (dval[0][1] == ibval)]) - - def write_data(self, f=sys.stdout): + def writeData(self, f=sys.stdout, fmt="%.1f", delimiter=", "): '''Write data :param f: handle to write to :type f: file handle or StringIO stream, defaults to sys.stdout ''' - def prnt_data(vars): - f.write(str(self.splitChar.join([str(x) for x in vars])) + "\n") - - if self.format == Formats.FFI_1001: - nul = self._write_data_1001(prnt=prnt_data) - elif self.format == Formats.FFI_2110: - nul = self._write_data_2110(prnt=prnt_data) - else: - warnings.warn("Unknown file format {:d}".format(self.format)) + self.data.write( fmt=fmt, delimiter=delimiter ) - def write(self, f=sys.stdout): + def write(self, f=sys.stdout, fmt="%.1f", delimiter=", "): '''Write header and data :param f: handle to write to :type f: file handle or StringIO stream, defaults to sys.stdout ''' - self.write_header(f=f) - self.write_data(f=f) + self.writeHeader(f=f, delimiter=delimiter) + self.writeData(f=f, fmt=fmt, delimiter=delimiter) - def make_filename(self, date_format='%Y%m%d'): + def makeFileName(self, dateFormat='%Y%m%d'): '''Create ICARTT-compliant file name based on the information contained in the dataset - :param date_format: date format to use when parsing - :type date_format: str, defaults to '%Y%m%d' + :param dateFormat: date format to use when parsing + :type dateFormat: str, defaults to '%Y%m%d' :return: file name generated :rtype: string ''' fn = self.dataID + "_" + self.locationID + "_" + \ - datetime.datetime.strftime(self.Date_Collection, date_format) + datetime.datetime.strftime(self.dateOfCollection, dateFormat) fn += "_R" + str(self.revision) if not self.revision is None else "" fn += "_L" + str(self.launch) if not self.launch is None else "" - fn += "_V" + str(self.File_Volume_Number) if self.Total_Number_Of_File_Volumes > 1 else "" + fn += "_V" + str(self.fileVolumeNumber) if self.totalNumberOfFileVolumes > 1 else "" return fn + ".ict" - def is_valid_filename(self, name): + def isValidFileName(self, name): # ICARTT standard v2 2.1.1 3) # Filename: Uppercase and lowercase ASCII alphanumeric # characters (i.e. A-Z, a-z, 0-9), underscore, period, # and hyphen. File names can be a maximum 127 # characters in length. - def is_ascii_alpha(x): + def isAsciiAlpha(x): return re.match("[a-zA-Z0-9-_.]", x) - all_ascii_alpha = all( [ is_ascii_alpha(x) for x in name ] ) - less_than_128_characters = len(name) < 128 + allAsciiAlpha = all( [ isAsciiAlpha(x) for x in name ] ) + lessThan128Characters = len(name) < 128 - return all_ascii_alpha and less_than_128_characters + return allAsciiAlpha and lessThan128Characters - def read_header(self): + def readHeader(self, splitChar=","): '''Read the ICARTT header (from file) ''' - class Filehandle_with_linecounter: + class FilehandleWithLinecounter: def __init__(self, f, splitChar): self.f = f self.line = 0 self.splitChar = splitChar - def readline(self, do_split=True): + def readline(self, doSplit=True): self.line += 1 dmp = self.f.readline().replace('\n', '').replace('\r', '') - if do_split: + if doSplit: dmp = [word.strip(' ') for word in dmp.split(self.splitChar)] return dmp - if self.input_fhandle.closed: - self.input_fhandle = open(self.input_fhandle.name) + if self.inputFhandle.closed: + self.inputFhandle = open(self.inputFhandle.name) try: - f = Filehandle_with_linecounter(self.input_fhandle, self.splitChar) + f = FilehandleWithLinecounter(self.inputFhandle, splitChar) - self._read_header(f) + self._readHeader(f) del f except: a = 1 finally: - self.input_fhandle.close() + self.inputFhandle.close() - def _read_header(self, f): + def _readHeader(self, f): # line 1 - Number of lines in header, file format index (most files use # 1001) - comma delimited. dmp = f.readline() - nheader_suggested = int(dmp[0]) + nheaderSuggested = int(dmp[0]) try: self.format = Formats(int(dmp[1])) except: @@ -562,31 +528,31 @@ class Dataset: self.version = dmp[2] # line 2 - PI last name, first name/initial. - self.PI_name = f.readline(do_split=False) + self.PIName = f.readline(doSplit=False) # line 3 - Organization/affiliation of PI. - self.PI_affiliation = f.readline(do_split=False) + self.PIAffiliation = f.readline(doSplit=False) # line 4 - Data source description (e.g., instrument name, platform name, # model name, etc.). - self.Data_Source_Description = f.readline(do_split=False) + self.dataSourceDescription = f.readline(doSplit=False) # line 5 - Mission name (usually the mission acronym). - self.Mission_Name = f.readline(do_split=False) + self.missionName = f.readline(doSplit=False) # line 6 - File volume number, number of file volumes (these integer values # are used when the data require more than one file per day; for data that # require only one file these values are set to 1, 1) - comma delimited. dmp = f.readline() - self.File_Volume_Number = int(dmp[0]) - self.Total_Number_Of_File_Volumes = int(dmp[1]) + self.fileVolumeNumber = int(dmp[0]) + self.totalNumberOfFileVolumes = int(dmp[1]) # line 7 - UTC date when data begin, UTC date of data reduction or revision # - comma delimited (yyyy, mm, dd, yyyy, mm, dd). dmp = f.readline() - self.Date_Collection = datetime.datetime.strptime( + self.dateOfCollection = datetime.datetime.strptime( "".join(["{:s}".format(x) for x in dmp[0:3]]), '%Y%m%d') - self.Revision_Date = datetime.datetime.strptime( + self.dateOfRevision = datetime.datetime.strptime( "".join(["{:s}".format(x) for x in dmp[3:6]]), '%Y%m%d') # line 8 - Data Interval (This value describes the time spacing (in seconds) @@ -599,7 +565,7 @@ class Dataset: # 2.5 below.). dmp = f.readline() # might have multiple entries for 2110 - self.Data_Interval_Code = [ float(x) for x in dmp ] + self.dataIntervalCode = [ float(x) for x in dmp ] # line 9 - Description or name of independent variable (This is the name # chosen for the start time. It always refers to the number of seconds UTC @@ -607,25 +573,25 @@ class Dataset: # here that the independent variable should monotonically increase even when # crossing over to a second day. - def extract_vardesc(dmp): + def extractVardesc(dmp): shortname = dmp[0] units = dmp[1] standardname = dmp[2] if len(dmp) > 2 else None longname = dmp[3] if len(dmp) > 3 else None return shortname, units, standardname, longname - if self.format == Formats.FFI_2110: + if self.format == Formats.FFI2110: dmp = f.readline() - shortname, units, standardname, longname = extract_vardesc(dmp) - self.Independent_Bounded_Variable = Variable(shortname, units, standardname, longname, - splitChar=self.splitChar) + shortname, units, standardname, longname = extractVardesc(dmp) + self.independentBoundedVariable = Variable(shortname, units, standardname, longname, + vartype=VariableType.IndependentBoundedVariable) dmp = f.readline() - shortname, units, standardname, longname = extract_vardesc(dmp) - self.Independent_Variable = Variable(shortname, units, standardname, longname, - splitChar=self.splitChar) + shortname, units, standardname, longname = extractVardesc(dmp) + self.independentVariable = Variable(shortname, units, standardname, longname, + vartype=VariableType.IndependentVariable) - def read_vars(f): + def readVars(f, vtype): # line 10 - Number of variables (Integer value showing the number of # dependent variables: the total number of columns of data is this value # plus one.). @@ -649,7 +615,7 @@ class Dataset: # the name used for that variable as a column header, i.e., the last header # line prior to start of data.). dmp = f.readline() - shortname, units, standardname, longname = extract_vardesc(dmp) + shortname, units, standardname, longname = extractVardesc(dmp) vshortname = [ shortname ] vunits = [ units ] vstandardname = [ standardname ] @@ -657,18 +623,18 @@ class Dataset: for i in range(1, nvar): dmp = f.readline() - shortname, units, standardname, longname = extract_vardesc(dmp) + shortname, units, standardname, longname = extractVardesc(dmp) vshortname += [ shortname ] vunits += [ units ] vstandardname += [ standardname ] vlongname += [ longname ] - return {shortname: Variable(shortname, unit, standardname, longname, scale=scale, miss=miss, splitChar=self.splitChar) for shortname, unit, standardname, longname, scale, miss in zip(vshortname, vunits, vstandardname, vlongname, vscale, vmiss)} + return { shortname: Variable(shortname, unit, standardname, longname, scale=scale, miss=miss, vartype=vtype) for shortname, unit, standardname, longname, scale, miss in zip(vshortname, vunits, vstandardname, vlongname, vscale, vmiss)} - self.Dependent_Variables = read_vars(f) + self.dependentVariables = readVars(f, VariableType.DependentVariable) - if self.format == Formats.FFI_2110: - self.Auxiliary_Variables = read_vars(f) + if self.format == Formats.FFI2110: + self.auxiliaryVariables = readVars(f, VariableType.AuxiliaryVariable) # line 14 + nvar - Number of SPECIAL comment lines (Integer value # indicating the number of lines of special comments, NOT including this @@ -678,7 +644,7 @@ class Dataset: # line 15 + nvar - Special comments (Notes of problems or special # circumstances unique to this file. An example would be comments/problems # associated with a particular flight.). - self.Special_Comments = [f.readline(do_split=False) for i in range(0, nscom)] + self.specialComments = [f.readline(doSplit=False) for i in range(0, nscom)] # line 16 + nvar + nscom - Number of Normal comments (i.e., number of # additional lines of SUPPORTING information: Integer value indicating the @@ -700,56 +666,60 @@ class Dataset: # this section of the header along with the relevant data listed after the # colon. For key words where information is not needed or applicable, simply # enter N/A.). - raw_ncom = [f.readline(do_split=False) for i in range(0, nncom)] - self.Normal_Comments.ingest(raw_ncom) + rawNcom = [f.readline(doSplit=False) for i in range(0, nncom)] + self.normalComments.ingest(rawNcom) - self.nheader_file = f.line + self.nheaderFile = f.line - if self.nheader != nheader_suggested: + if self.nheader != nheaderSuggested: warnings.warn("Number of header lines suggested in line 1 ({:d}) do not match actual header lines read ({:d})".format( - nheader_suggested, self.nheader)) + nheaderSuggested, self.nheader)) - def read_data(self): + def readData(self, splitChar=","): '''Read ICARTT data (from file) ''' - if self.input_fhandle.closed: - self.input_fhandle = open(self.input_fhandle.name) + if self.inputFhandle.closed: + self.inputFhandle = open(self.inputFhandle.name) try: - nul = [self.input_fhandle.readline() for i in range(self.nheader_file)] + nul = [self.inputFhandle.readline() for i in range(self.nheaderFile)] - raw = [line.split(self.splitChar) for line in self.input_fhandle] + raw = [line.split(splitChar) for line in self.inputFhandle] - nul = self.data.add_bulk(raw) + nul = self.data.addBulkFromTxt(raw) except: pass finally: - self.input_fhandle.close() + self.inputFhandle.close() - def read(self): + def read(self, splitChar=","): '''Read ICARTT data and header ''' - self.read_header() - self.end_define_mode() - self.read_data() + self.readHeader(splitChar) + self.endDefineMode(splitChar) + self.readData(splitChar) def __del__(self): try: - if not self.input_fhandle.closed: - self.input_fhandle.close() + if not self.inputFhandle.closed: + self.inputFhandle.close() except: pass - def end_define_mode(self): - self.DEFINE_MODE = False + def endDefineMode(self): + '''Fixes the variables structure of the dataset. Sets up the data store, + so data can be added. Needs to be called after variable definition + and before adding data. + ''' + self.DefineMode = False # create data store - if self.format == Formats.FFI_1001: - self.data = DataStore_1001(self.Independent_Variable, self.Dependent_Variables ) - elif self.format == Formats.FFI_2110: - self.data = DataStore_2110(self.Independent_Variable, self.Independent_Bounded_Variable, self.Auxiliary_Variables, self.Dependent_Variables ) + if self.format == Formats.FFI1001: + self.data = DataStore1001(self.independentVariable, self.dependentVariables ) + elif self.format == Formats.FFI2110: + self.data = DataStore2110(self.independentVariable, self.independentBoundedVariable, self.auxiliaryVariables, self.dependentVariables ) - def __init__(self, f=None, loadData=True, splitChar=",", format=Formats.FFI_1001): + def __init__(self, f=None, loadData=True, splitChar=",", format=Formats.FFI1001): '''Constructor method ''' self.format = format @@ -760,42 +730,41 @@ class Dataset: self.revision = 0 self.launch = None - self.File_Volume_Number = 1 - self.Total_Number_Of_File_Volumes = 1 - - self.PI_name = 'Mustermann, Martin' - self.PI_affiliation = 'Musterinstitut' - self.Data_Source_Description = 'Musterdatenprodukt' - self.Mission_Name = 'MUSTEREX' - self.Date_Collection = datetime.datetime.today() - self.Revision_Date = datetime.datetime.today() - self.Data_Interval_Code = [ 0.0 ] - self.Independent_Variable = None - self.Independent_Bounded_Variable = None - self.Auxiliary_Variables = {} - self.Dependent_Variables = {} - - self.Special_Comments = [] - self.Normal_Comments = StandardNormalComments() + self.fileVolumeNumber = 1 + self.totalNumberOfFileVolumes = 1 + + self.PIName = 'Mustermann, Martin' + self.PIAffiliation = 'Musterinstitut' + self.dataSourceDescription = 'Musterdatenprodukt' + self.missionName = 'MUSTEREX' + self.dateOfCollection = datetime.datetime.today() + self.dateOfRevision = datetime.datetime.today() + self.dataIntervalCode = [ 0.0 ] + + self.independentVariable = None + self.independentBoundedVariable = None + self.auxiliaryVariables = {} + self.dependentVariables = {} + + self.specialComments = [] + self.normalComments = StandardNormalComments() # Standard v2.0 for normal comments requires all keywords present, # might not be the case - then reading data will fail - self.nheader_file = -1 - - self.splitChar = splitChar + self.nheaderFile = -1 self.data = None - self.DEFINE_MODE = True + self.DefineMode = True # read data if f is not None if f is not None: if isinstance(f, str): - self.input_fhandle = open(f, 'r') + self.inputFhandle = open(f, 'r') else: - self.input_fhandle = f + self.inputFhandle = f - self.read_header() + self.readHeader(splitChar) if loadData: - self.end_define_mode() - self.read_data() + self.endDefineMode() + self.readData(splitChar) -- GitLab