Newer
Older
import collections
import copy
import math
from functools import total_ordering
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
# ICARTT file format indices this module implements; the header reader
# asserts that a file's format index is one of these.
IMPLEMENTED_FORMATS = [1001, 2110]
class SimpleNormalComments(collections.UserList):
    '''
    Free-form normal comments: a list-like view on raw comment lines.

    The list content exposed through the UserList interface is the
    ``contents`` attribute itself (not a copy).

    :param list contents: comment lines; a new empty list if omitted
    '''

    @property
    def data(self):
        # UserList implements its list operations on top of ``data``.
        return self.contents

    def __init__(self, contents=None):
        # A fresh list per instance: a mutable default argument would be
        # shared by every instance created without arguments.
        # UserList.__init__ is not called: it assigns to ``data``, which is
        # a read-only property here.
        self.contents = [] if contents is None else contents
class StandardNormalComments(SimpleNormalComments):
    '''
    Normal comments organised as ``KEYWORD: value`` lines, free-form lines
    and a final line holding the short variable names.

    :param list contents: raw comment lines to ingest; if omitted or empty,
        all keywords keep their default values
    '''

    @property
    def data(self):
        # Order: all keyword lines, then free-form lines, then the short names.
        keyword_lines = [k + ": " + str(v) for k, v in self.keywords.items()]
        return keyword_lines + self.freeform + [self.shortnames]

    def ingest(self, raw):
        '''
        Sort raw comment lines into keywords, free-form text and short names.

        The caller's list is left unmodified.

        :param list raw: comment lines; the last one is taken as the short names line
        :raises IndexError: if raw is empty
        '''
        lines = list(raw)
        # last line is always shortname
        self.shortnames = lines[-1]
        # rest is either keyword, or free form
        for line in lines[:-1]:
            for key in self.keywords:
                prefix = key + ":"
                if line.startswith(prefix):
                    # Strip only the leading "KEY:"; the value itself may
                    # legitimately contain the same text again.
                    self.keywords[key] = line[len(prefix):].strip()
                    break
            else:
                self.freeform.append(line)

    def __init__(self, contents=None):
        self.freeform = []
        self.keywords = {
            "PI_CONTACT_INFO": "N/A",
            "PLATFORM": "N/A",
            "LOCATION": "N/A",
            "ASSOCIATED_DATA": "N/A",
            "INSTRUMENT_INFO": "N/A",
            "DATA_INFO": "N/A",
            "UNCERTAINTY": -1,
            "ULOD_FLAG": "N/A",
            "ULOD_VALUE": "N/A",
            "LLOD_FLAG": "N/A",
            "LLOD_VALUE": "N/A",
            "DM_CONTACT_INFO": "N/A",
            "PROJECT_INFO": "N/A",
            "STIPULATIONS_ON_USE": "N/A",
            "OTHER_COMMENTS": "N/A",
            "REVISION": 0,
            "R0": "N/A"
        }
        self.shortnames = []
        # Truthiness test: the former identity comparison against a list
        # literal was always true, so ingest() ran (and failed) on empty input.
        if contents:
            self.ingest(contents)
class Data_1001(collections.UserList):
    '''
    Data section with one independent variable and a list of dependent
    variables; parsed values are appended to the variable objects passed in.

    NOTE(review): UserList.__init__ is not called and ``data`` is never set
    here, so the inherited list operations are not usable on instances --
    confirm whether the UserList base is needed.

    :param raw: data records, each a sequence of strings: the independent
        value first, then one field per dependent variable
    :param ivar: independent variable, must provide ``append_value_from_string_ivar``
    :param list dvar: dependent variables, each must provide ``append_value_from_string``
    '''

    def write(self, prnt=None):
        '''
        Emit the data record by record.

        :param callable prnt: called once per record with the list
            ``[ivar_value, dvar_value_1, ...]``. If omitted, records are
            written comma-separated, one per line, to standard output.
        '''
        if prnt is None:
            import sys

            # sys.stdout.write() only accepts strings, so the record has to
            # be formatted before it is written.
            def prnt(values):
                sys.stdout.write(",".join(str(v) for v in values) + "\n")

        for ivalue, *ditems in zip(self.ivar, *self.dvar):
            # Dependent items are pairs; their second element is the value.
            prnt([ivalue] + [item[1] for item in ditems])

    def extract_items(self, raw):
        '''
        Append every record in raw to the independent and dependent variables.

        :param raw: iterable of records (sequences of strings)
        '''
        for row in raw:
            self.ivar.append_value_from_string_ivar(row[0])
            for column, var in enumerate(self.dvar, start=1):
                var.append_value_from_string(row[0], row[column])

    def __init__(self, raw=(), ivar=None, dvar=None):
        self.ivar = ivar
        self.dvar = dvar
        #
        self.extract_items(raw)
class Data_2110(Data_1001):
    '''
    Data section made of groups: one record with the independent value and
    the auxiliary values, followed by records that each hold a bounded
    independent value and the dependent values. The number of records
    following a group's first record is taken from the first auxiliary
    variable.

    :param raw: sequence of records, each a sequence of strings
    :param ivar: independent variable
    :param ibvar: bounded independent variable
    :param list auxvar: auxiliary variables
    :param list dvar: dependent variables
    '''

    def write(self, prnt=None):
        '''
        Emit the data group by group.

        :param callable prnt: called once per record with a list of values.
            If omitted, records are written comma-separated, one per line,
            to standard output.
        '''
        if prnt is None:
            import sys

            # sys.stdout.write() only accepts strings, so the record has to
            # be formatted before it is written.
            def prnt(values):
                sys.stdout.write(",".join(str(v) for v in values) + "\n")

        for ivar in self.ivar:
            # Group header: independent value plus its auxiliary values.
            prnt([ivar] + [a[1] for auxvar in self.auxvar for a in auxvar if a[0] == ivar])
            # The bounded values of this group are looked up in the first
            # dependent variable, whose items are keyed by (ivar, ibvar).
            bounded = [b[0][1] for b in self.dvar[0] if b[0][0] == ivar]
            for ibvar in bounded:
                prnt([ibvar] + [d[1] for dvar in self.dvar for d in dvar
                                if (d[0][0] == ivar) and (d[0][1] == ibvar)])

    def extract_items(self, raw):
        '''
        Parse all groups in raw and append their values to the variables.

        :param raw: sequence of records (sequences of strings)
        :raises IndexError: if a group announces more records than raw contains
        '''
        cur = 0
        while cur < len(raw):
            head = raw[cur]
            self.ivar.append_value_from_string_ivar(head[0])
            for column, var in enumerate(self.auxvar, start=1):
                var.append_value_from_string_ibvar(head[0], head[column])
            # Record count of THIS group: the value just appended to the
            # first auxiliary variable (index -1, not the very first group).
            nprimary = int(self.auxvar[0][-1][1])
            for offset in range(nprimary):
                # Rows are addressed relative to the current group header.
                row = raw[cur + 1 + offset]
                self.ibvar.append_value_from_string_ibvar(head[0], row[0])
                for column, var in enumerate(self.dvar, start=1):
                    var.append_value_from_string(head[0], row[0], row[column])
            cur += 1 + nprimary

    def __init__(self, raw=(), ivar=None, ibvar=None, auxvar=None, dvar=None):
        self.ivar = ivar
        self.ibvar = ibvar
        self.auxvar = auxvar
        self.dvar = dvar
        #
        self.extract_items(raw)
class Variable_1001(collections.UserList):
    '''
    A Variable is an ICARTT variable description with name, units, scale and missing value.
    '''

    @property
    def desc(self):
        '''
        Return variable description string as it appears in an ICARTT file
        '''
        toplot = [self.name, self.units]
        # The long name is omitted when it merely repeats the units.
        if self.units != self.longname:
            toplot += [self.longname]
        return self.splitChar.join(toplot)

    def __add__(self, other):
        '''
        Append all items of other to this variable's data, in place.

        :param other: iterable of items to append
        :return: this variable itself (no new object is created)
        '''
        # Snapshot first, so that adding a variable to itself terminates.
        self.data.extend(list(other))
        return self

    def _sanitize(self, x):
        '''
        Convert a raw field to float; the missing value marker becomes NaN.

        :param string x: field as read from the file, surrounding whitespace is ignored
        :raises ValueError: if the field is not a number
        '''
        text = x.strip()
        # Whole-field comparison: a substring replacement would corrupt
        # numbers that merely contain the missing value's digits.
        if text == self.miss:
            return float('NaN')
        return float(text)

    def append_value_from_string_ivar(self, ivar):
        '''Append an independent value given as string.'''
        self.data.append(self._sanitize(ivar))

    def append_value_from_string(self, ivar, dvar):
        '''Append a pair (independent value, value), both given as strings.'''
        self.data.append((self._sanitize(ivar), self._sanitize(dvar)))

    def __init__(self, name, units, longname, values=None, scale=1.0, miss=-99999, splitChar=","):
        '''
        :param string name: short name
        :param string units: units
        :param string longname: long name
        :param values: accepted but not stored, the data always starts empty
            (NOTE(review): confirm this is intended)
        :param scale: scale factor
        :param miss: missing value marker; stored as string
        :param string splitChar: separator used in the description string
        '''
        #: Name
        self.name = name
        #: Units
        self.units = units
        #: Long name
        self.longname = longname
        #: Values
        self.data = []
        #: Scale factor
        self.scale = scale
        #: Missing value (string, just as it appears in the ICARTT file)
        self.miss = str(miss)
        #: Split character for the description string
        self.splitChar = splitChar
class Variable_2110(Variable_1001):
    '''
    Variable whose items carry a bounded independent value in addition to
    the independent value.
    '''

    def append_value_from_string_ibvar(self, ivar, ibvar):
        '''Append the pair (independent value, bounded value), both given as strings.'''
        pair = (self._sanitize(ivar), self._sanitize(ibvar))
        self.data.append(pair)

    def append_value_from_string(self, ivar, ibvar, dvar):
        '''Append ((independent value, bounded value), value), all given as strings.'''
        key = (self._sanitize(ivar), self._sanitize(ibvar))
        value = self._sanitize(dvar)
        self.data.append((key, value))
'''
An ICARTT dataset that can be created from scratch or read from a file,
manipulated, and then written to a file.
'''
total = -1
if self.format == 1001:
total = 14 + self.ndvar + self.nscom + self.nncom
total = 16 + self.nivar + self.nauxvar + self.ndvar + self.nscom + self.nncom
def nivar(self):
    '''
    Independent variable count: 1, or 2 if IBVAR is set
    '''
    if self.IBVAR is None:
        return 1
    return 2
@property
vars = [ self.IVAR ] + self.DVAR
if self.format == 2110:
vars = [ self.IBVAR ] + vars + self.AUXVAR
return vars
'''
Names of variables (independent and dependent)
'''
@property
def times(self):
    '''
    Time steps of the data contained: dateValid plus each value of the
    independent variable, taken as seconds.
    '''
    seconds = self[self.IVAR.name]
    return [self.dateValid + datetime.timedelta(seconds=s) for s in seconds]
def __getitem__(self, name):
    '''
    Convenience function to access variable data by name::

        ict = icartt.Dataset(<fname>)
        ict['O3']

    :param string name: name of the variable
    :return: the entry of VAR with that name
    :raises Exception: if not exactly one variable has that name
    '''
    matches = [x for x in self.VAR if x.name == name]
    if len(matches) != 1:
        raise Exception("{:s} not found in data".format(name))
    # Hand back the variable that was looked up (not an empty string).
    return matches[0]
def index(self, name):
    '''
    Index of variable <name> in data array

    :param string name: name of the variable
    :return: position of the first entry of VAR with that name, -1 if there is none

    NOTE(review): this function used to end without returning anything;
    -1 as "not found" marker is an assumption -- confirm against callers.
    '''
    for i, x in enumerate(self.VAR):
        if x.name == name:
            return i
    return -1
def prnt(txt):
f.write(str(txt) + "\n")
# Number of lines in header, file format index (most files use 1001) - comma delimited.
prnt("{:d}, {:d}".format(self.nheader, self.format))
# PI last name, first name/initial.
prnt(self.PI)
# Organization/affiliation of PI.
prnt(self.organization)
# Data source description (e.g., instrument name, platform name, model name, etc.).
prnt(self.dataSource)
# Mission name (usually the mission acronym).
prnt(self.mission)
# File volume number, number of file volumes (these integer values are used when the data require more than one file per day; for data that require only one file these values are set to 1, 1) - comma delimited.
prnt(self.splitChar.join([ str(self.volume), str(self.nvolumes) ]))
# UTC date when data begin, UTC date of data reduction or revision - comma delimited (yyyy, mm, dd, yyyy, mm, dd).
prnt(self.splitChar.join([ datetime.datetime.strftime(x, self.splitChar.join(["%Y","%m","%d"])) for x in [ self.dateValid, self.dateRevised ] ]))
# Data Interval (This value describes the time spacing (in seconds) between consecutive data records. It is the (constant) interval between values of the independent variable. For 1 Hz data the data interval value is 1 and for 10 Hz data the value is 0.1. All intervals longer than 1 second must be reported as Start and Stop times, and the Data Interval value is set to 0. The Mid-point time is required when it is not at the average of Start and Stop times. For additional information see Section 2.5 below.).
if self.format == 2110:
# Description or name of the independent bounded variable (format 2110 only); it is written before the description of the independent variable.
prnt(self.IBVAR.desc)
# Description or name of independent variable (This is the name chosen for the start time. It always refers to the number of seconds UTC from the start of the day on which measurements began. It should be noted here that the independent variable should monotonically increase even when crossing over to a second day.).
prnt(self.IVAR.desc)
# Number of variables (Integer value showing the number of dependent variables: the total number of columns of data is this value plus one.).
prnt(self.ndvar)
# Scale factors (1 for most cases, except where grossly inconvenient) - comma delimited.
prnt(self.splitChar.join( [ "{:.1g}".format(x.scale) for x in self.DVAR ]))
# Missing data indicators (This is -9999 (or -99999, etc.) for any missing data condition, except for the main time (independent) variable which is never missing) - comma delimited.
prnt(self.splitChar.join( [ str(x.miss) for x in self.DVAR ]))
# Variable names and units (Short variable name and units are required, and optional long descriptive name, in that order, and separated by commas. If the variable is unitless, enter the keyword "none" for its units. Each short variable name and units (and optional long name) are entered on one line. The short variable name must correspond exactly to the name used for that variable as a column header, i.e., the last header line prior to start of data.).
nul = [ prnt(x.desc) for x in self.DVAR ]
if self.format == 2110:
# Number of auxiliary variables (integer).
prnt(self.nauxvar)
# Scale factors (1 for most cases, except where grossly inconvenient) - comma delimited.
prnt(self.splitChar.join( [ "{:.1g}".format(x.scale) for x in self.AUXVAR ]))
# Missing data indicators (This is -9999 (or -99999, etc.) for any missing data condition, except for the main time (independent) variable which is never missing) - comma delimited.
prnt(self.splitChar.join( [ str(x.miss) for x in self.AUXVAR ]))
# Variable names and units (Short variable name and units are required, and optional long descriptive name, in that order, and separated by commas. If the variable is unitless, enter the keyword "none" for its units. Each short variable name and units (and optional long name) are entered on one line. The short variable name must correspond exactly to the name used for that variable as a column header, i.e., the last header line prior to start of data.).
nul = [ prnt(x.desc) for x in self.AUXVAR ]
# Number of SPECIAL comment lines (Integer value indicating the number of lines of special comments, NOT including this line.).
prnt("{:d}".format(self.nscom))
# Special comments (Notes of problems or special circumstances unique to this file. An example would be comments/problems associated with a particular flight.).
nul = [ prnt(x) for x in self.SCOM ]
# Number of Normal comments (i.e., number of additional lines of SUPPORTING information: Integer value indicating the number of lines of additional information, NOT including this line.).
prnt("{:d}".format(self.nncom))
# Normal comments (SUPPORTING information: This is the place for investigators to more completely describe the data and measurement parameters. The supporting information structure is described below as a list of key word: value pairs. Specifically include here information on the platform used, the geo-location of data, measurement technique, and data revision comments. Note the non-optional information regarding uncertainty, the upper limit of detection (ULOD) and the lower limit of detection (LLOD) for each measured variable. The ULOD and LLOD are the values, in the same units as the measurements that correspond to the flags -7777s and -8888s within the data, respectively. The last line of this section should contain all the short variable names on one line. The key words in this section are written in BOLD below and must appear in this section of the header along with the relevant data listed after the colon. For key words where information is not needed or applicable, simply enter N/A.).
nul = [ prnt(x) for x in self.NCOM ]
def prnt_data(vars):
prnt( self.splitChar.join([ str(x) for x in vars ]) )
nul = self.data.write(prnt=prnt_data)
'''
Create ICARTT-compliant file name based on the information contained in the dataset
'''
fn = self.dataID + "_" +self.locationID + "_" +datetime.datetime.strftime(self.dateValid, date_format)
fn += "_R" + str(self.revision) if not self.revision is None else ""
fn += "_L" + str(self.launch) if not self.launch is None else ""
fn += "_V" + str(self.volume) if self.nvolumes > 1 else ""
return fn + ".ict"
'''
Read the ICARTT header (from file)
'''
class Filehandle_with_linecounter:
    '''
    Wrap a file object and count the lines read through this wrapper.

    :param f: object with a ``readline()`` method returning strings
    :param string splitChar: separator used to split a line into fields
    '''

    def __init__(self, f, splitChar):
        self.f = f
        self.line = 0
        self.splitChar = splitChar

    def readline(self, do_split=True):
        '''
        Read the next line without its line-break characters.

        :param bool do_split: return a list of fields (each stripped of
            surrounding spaces) instead of the line as one string
        '''
        self.line += 1
        # Drop every newline and carriage return character of the line.
        text = self.f.readline().translate({ord('\n'): None, ord('\r'): None})
        if not do_split:
            return text
        return [field.strip(' ') for field in text.split(self.splitChar)]
if self.input_fhandle.closed:
self.input_fhandle = open(self.input_fhandle.name)
f = Filehandle_with_linecounter(self.input_fhandle, self.splitChar)
# line 1 - Number of lines in header, file format index (most files use
# 1001) - comma delimited.
dmp = f.readline()
self.format = int(dmp[1])
if len(dmp) > 2:
self.version = dmp[2]
assert self.format in IMPLEMENTED_FORMATS, "ICARTT format {:d} not implemented".format(self.format)
# line 4 - Data source description (e.g., instrument name, platform name,
# model name, etc.).
# line 5 - Mission name (usually the mission acronym).
# line 6 - File volume number, number of file volumes (these integer values
# are used when the data require more than one file per day; for data that
# require only one file these values are set to 1, 1) - comma delimited.
dmp = f.readline()
self.volume = int(dmp[0])
self.nvolumes = int(dmp[1])
# line 7 - UTC date when data begin, UTC date of data reduction or revision
# - comma delimited (yyyy, mm, dd, yyyy, mm, dd).
self.dateValid = datetime.datetime.strptime("".join([ "{:s}".format(x) for x in dmp[0:3] ]), '%Y%m%d')
self.dateRevised = datetime.datetime.strptime("".join([ "{:s}".format(x) for x in dmp[3:6] ]), '%Y%m%d')
# line 8 - Data Interval (This value describes the time spacing (in seconds)
# between consecutive data records. It is the (constant) interval between
# values of the independent variable. For 1 Hz data the data interval value
# is 1 and for 10 Hz data the value is 0.1. All intervals longer than 1
# second must be reported as Start and Stop times, and the Data Interval
# value is set to 0. The Mid-point time is required when it is not at the
# average of Start and Stop times. For additional information see Section
# 2.5 below.).
self.dataInterval = float(f.readline()[0])
Variable = Variable_1001 if self.format == 1001 else Variable_2110
# line 9 - Description or name of independent variable (This is the name
# chosen for the start time. It always refers to the number of seconds UTC
# from the start of the day on which measurements began. It should be noted
# here that the independent variable should monotonically increase even when
# crossing over to a second day.
if self.format == 2110:
dmp = f.readline()
self.IBVAR = Variable(dmp[0], dmp[1], dmp[2 if len(dmp) > 2 else 1], splitChar=self.splitChar)
dmp = f.readline()
self.IVAR = Variable(dmp[0], dmp[1], dmp[2 if len(dmp) > 2 else 1], splitChar=self.splitChar)
# line 10 - Number of variables (Integer value showing the number of
# dependent variables: the total number of columns of data is this value
# plus one.).
# line 11- Scale factors (1 for most cases, except where grossly
# inconvenient) - comma delimited.
# line 12 - Missing data indicators (This is -9999 (or -99999, etc.) for
# any missing data condition, except for the main time (independent)
# variable which is never missing) - comma delimited.
# no float casting here, as we need to do string comparison lateron when reading data...
# line 13 - Variable names and units (Short variable name and units are
# required, and optional long descriptive name, in that order, and separated
# by commas. If the variable is unitless, enter the keyword "none" for its
# units. Each short variable name and units (and optional long name) are
# entered on one line. The short variable name must correspond exactly to
# the name used for that variable as a column header, i.e., the last header
# line prior to start of data.).
dmp = f.readline()
dvname += [ dmp[0] ]
dvunits += [ dmp[1] ]
dvlongname += [ dmp[2 if len(dmp) > 2 else 1] ]
self.DVAR = [ Variable(name, unit, longname, scale=scale, miss=miss, splitChar=self.splitChar) for name, unit, longname, scale, miss in zip(dvname, dvunits, dvlongname, dvscale, dvmiss) ]
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
if self.format == 2110:
# line 10 - Number of variables (Integer value showing the number of
# dependent variables: the total number of columns of data is this value
# plus one.).
navar = int(f.readline()[0])
# line 11- Scale factors (1 for most cases, except where grossly
# inconvenient) - comma delimited.
avscale = [ float(x) for x in f.readline() ]
# line 12 - Missing data indicators (This is -9999 (or -99999, etc.) for
# any missing data condition, except for the main time (independent)
# variable which is never missing) - comma delimited.
avmiss = [ x for x in f.readline() ]
# no float casting here, as we need to do string comparison lateron when reading data...
# line 13 - Variable names and units (Short variable name and units are
# required, and optional long descriptive name, in that order, and separated
# by commas. If the variable is unitless, enter the keyword "none" for its
# units. Each short variable name and units (and optional long name) are
# entered on one line. The short variable name must correspond exactly to
# the name used for that variable as a column header, i.e., the last header
# line prior to start of data.).
dmp = f.readline()
avname = [ dmp[0] ]
avunits = [ dmp[1] ]
avlongname = [ dmp[2 if len(dmp) > 2 else 1] ]
for i in range(1, navar):
dmp = f.readline()
avname += [ dmp[0] ]
avunits += [ dmp[1] ]
avlongname += [ dmp[2 if len(dmp) > 2 else 1] ]
self.AUXVAR = [ Variable(name, unit, longname, scale=scale, miss=miss, splitChar=self.splitChar) for name, unit, longname, scale, miss in zip(avname, avunits, avlongname, avscale, avmiss) ]
# line 14 + nvar - Number of SPECIAL comment lines (Integer value
# indicating the number of lines of special comments, NOT including this
# line.).
# line 15 + nvar - Special comments (Notes of problems or special
# circumstances unique to this file. An example would be comments/problems
# associated with a particular flight.).
self.SCOM = [ f.readline(do_split=False) for i in range(0, nscom) ]
# line 16 + nvar + nscom - Number of Normal comments (i.e., number of
# additional lines of SUPPORTING information: Integer value indicating the
# number of lines of additional information, NOT including this line.).
# line 17 + nvar + nscom - Normal comments (SUPPORTING information: This is
# the place for investigators to more completely describe the data and
# measurement parameters. The supporting information structure is described
# below as a list of key word: value pairs. Specifically include here
# information on the platform used, the geo-location of data, measurement
# technique, and data revision comments. Note the non-optional information
# regarding uncertainty, the upper limit of detection (ULOD) and the lower
# limit of detection (LLOD) for each measured variable. The ULOD and LLOD
# are the values, in the same units as the measurements that correspond to
# the flags -7777's and -8888's within the data, respectively. The last line
# of this section should contain all the "short" variable names on one line.
# The key words in this section are written in BOLD below and must appear in
# this section of the header along with the relevant data listed after the
# colon. For key words where information is not needed or applicable, simply
# enter N/A.).
raw_ncom = [ f.readline(do_split=False) for i in range(0, nncom) ]
try:
self.NCOM = StandardNormalComments(raw_ncom)
except:
warnings.warn("Normal comments do not adhere to ICARTT v2.0 standard.")
self.NCOM = SimpleNormalComments(raw_ncom)
self.nheader_file = f.line
del f
self.input_fhandle.close()
def read_data(self):
    '''
    Read ICARTT data (from file)

    Skips ``nheader_file`` lines, then parses the remaining lines into
    ``self.data`` according to ``self.format``. Lines containing only
    whitespace are ignored. The file handle is closed afterwards, also if
    parsing fails.
    '''
    if self.input_fhandle.closed:
        self.input_fhandle = open(self.input_fhandle.name)
    try:
        if self.format not in (1001, 2110):
            print("Unknown format")
            return
        # Skip the header.
        for _ in range(self.nheader_file):
            self.input_fhandle.readline()
        # Fields are passed on unstripped (the last one still carries the
        # line break).
        rows = [line.split(self.splitChar) for line in self.input_fhandle if line.strip()]
        if self.format == 1001:
            self.data = Data_1001(rows, ivar=self.IVAR, dvar=self.DVAR)
        else:
            self.data = Data_2110(rows, ivar=self.IVAR, ibvar=self.IBVAR, auxvar=self.AUXVAR, dvar=self.DVAR)
    finally:
        self.input_fhandle.close()
def read_first_and_last(self):
    '''
    Read first and last ICARTT data line (from file). Useful for quick estimates e.g. of the time extent
    of big ICARTT files, without having to read the whole thing, which would be slow.

    ``self.data`` becomes a Data_1001 holding the first and, if there is
    more than one data line, the last record. Lines containing only
    whitespace are ignored. The file handle is closed afterwards.

    NOTE(review): the records are always parsed as Data_1001, whatever
    ``self.format`` is -- confirm this is intended for format 2110.
    '''
    if self.input_fhandle.closed:
        self.input_fhandle = open(self.input_fhandle.name)
    try:
        # Skip the header.
        for _ in range(self.nheader_file):
            self.input_fhandle.readline()
        first = self.input_fhandle.readline()
        # Stays None when there is no further data line.
        last = None
        for line in self.input_fhandle:
            if line.strip():
                last = line
        rows = [text.split(self.splitChar) for text in (first, last) if text and text.strip()]
        self.data = Data_1001(rows, ivar=self.IVAR, dvar=self.DVAR)
    finally:
        self.input_fhandle.close()
def __init__(self, f=None, loadData=True, splitChar=","):
'''
:param string/file f: file path or file object to use
:param bool loadData: load data as well (or only header if False)?
:param string splitChar: the splitting character used to separate fields in a line
'''
self.revision = 0
self.launch = None
self.volume = 1
self.nvolumes = 1
self.PI = 'Mustermann, Martin'
self.organization = 'Musterinstitut'
self.dataSource = 'Musterdatenprodukt'
self.mission = 'MUSTEREX'
self.dateValid = datetime.datetime.today()
self.dateRevised = datetime.datetime.today()
self.dataInterval = 0.0
self.IVAR = Variable_1001('Time_Start',
'seconds_from_0_hours_on_valid_date',
'seconds_from_0_hours_on_valid_date',
scale=1.0, miss=-9999999, splitChar=splitChar),
Variable_1001('Some_Variable',
'ppbv',
# Standard v2.0 for normal comments requires all keywords present,
# might not be the case - then reading data will fail
self.nheader_file = -1
self.input_fhandle = open(f, 'r')
else:
self.input_fhandle = f
self.read_header()
if loadData:
self.read_data()