Mercurial > hg > numerics
annotate numerics/read.py @ 80:8bfa28ff74ce
use that thing we made
| author | Jeff Hammel <k0scist@gmail.com> |
|---|---|
| date | Sun, 01 Mar 2015 09:29:38 -0800 |
| parents | ef915968d104 |
| children | b7d4b7f84883 |
| rev | line source |
|---|---|
|
5
d5447d401c44
serializaion; pandas probably does this
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
1 #!/usr/bin/env python |
|
d5447d401c44
serializaion; pandas probably does this
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
2 # -*- coding: utf-8 -*- |
|
d5447d401c44
serializaion; pandas probably does this
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
3 |
|
d5447d401c44
serializaion; pandas probably does this
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
4 """ |
|
d5447d401c44
serializaion; pandas probably does this
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
5 read CSV, etc |
|
d5447d401c44
serializaion; pandas probably does this
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
6 """ |
|
d5447d401c44
serializaion; pandas probably does this
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
7 |
|
d5447d401c44
serializaion; pandas probably does this
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
8 # imports |
|
d5447d401c44
serializaion; pandas probably does this
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
9 import argparse |
| 24 | 10 import csv |
|
5
d5447d401c44
serializaion; pandas probably does this
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
11 import os |
|
d5447d401c44
serializaion; pandas probably does this
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
12 import sys |
| 80 | 13 from .write import CSVWriter |
|
43
bcf9ec537bda
read from stdin if no files specified
Jeff Hammel <k0scist@gmail.com>
parents:
30
diff
changeset
|
14 |
|
5
d5447d401c44
serializaion; pandas probably does this
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
15 # module globals |
| 26 | 16 __all__ = ['main', 'CSVParser'] |
|
5
d5447d401c44
serializaion; pandas probably does this
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
# Types treated as "a path to open" rather than "an already-open file" by the
# readers in this module.
try:
    # Python 2: both byte strings and text strings can name a file
    string = (str, unicode)
except NameError:
    # Python 3: `unicode` no longer exists and `str` is the text type
    string = (str,)
|
d5447d401c44
serializaion; pandas probably does this
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
18 |
|
43
bcf9ec537bda
read from stdin if no files specified
Jeff Hammel <k0scist@gmail.com>
parents:
30
diff
changeset
|
19 |
class CSVSchema(object):
    """Read CSV rows as dictionaries keyed by a fixed list of column names."""

    def __init__(self, columns):
        # column names, paired positionally with the cells of each row
        self.columns = columns

    def read(self, f):
        """Parse `f` and return a list holding one dict per CSV row.

        `f` is either a path (an instance of the module-level `string`
        types), which is opened and read, or any object that `csv.reader`
        accepts.  Names and cells are paired with `zip`, so whichever of
        the two is longer is silently truncated.
        """

        if not isinstance(f, string):
            return [dict(zip(self.columns, cells))
                    for cells in csv.reader(f)]

        # a path was given: open it and parse the resulting file object
        with open(f) as handle:
            return self.read(handle)

    __call__ = read
| 39 | |
| 40 | |
def aggregate_columns(directory, schema):
    """Merge the columns of several CSV files into one list of row dicts.

    `schema` maps a file name (relative to `directory`) to the list of
    column names for that file.  Every file is read with `CSVSchema`, and
    the rows found at the same position in each file are merged into a
    single dict.  An empty `schema` yields an empty list.

    Raises `AssertionError` if a file named in `schema` does not exist, if
    the files do not all contain the same number of rows, or if two files
    give different values for the same column at the same row position.
    """

    # check for missing files; raised explicitly (not via `assert`) so the
    # check still runs when Python is started with -O
    missing = [path for path in schema
               if not os.path.exists(os.path.join(directory, path))]
    if missing:
        raise AssertionError("Missing files: {}".format(', '.join(missing)))

    # read records
    records = {filename: CSVSchema(columns).read(os.path.join(directory, filename))
               for filename, columns in schema.items()}

    # check lengths; zero files trivially agree, so only >1 distinct
    # length is an error
    lengths = set(len(rows) for rows in records.values())
    if len(lengths) > 1:
        raise AssertionError("Differing lengths found for files")

    # build new rows
    retval = []
    for row in zip(*records.values()):
        new_row = {}
        for record in row:
            for key, value in record.items():
                # a column shared between files must agree on its value
                if new_row.get(key, value) != value:
                    raise AssertionError("{} != {}".format(new_row.get(key), value))
                new_row[key] = value
        retval.append(new_row)

    return retval
| 69 | |
| 70 | |
def read_csv(*fp):
    """Read each CSV source in turn and return all rows as one flat list.

    Every argument is either a path (an instance of the module-level
    `string` types), which is opened and read, or an object that
    `csv.reader` accepts.  Rows are returned in argument order, each as
    the list of cells produced by `csv.reader`.
    """

    rows = []
    for source in fp:
        if isinstance(source, string):
            # a path: open it and read the resulting file object
            with open(source) as handle:
                rows.extend(read_csv(handle))
        else:
            rows.extend(list(csv.reader(source)))
    return rows
|
5
d5447d401c44
serializaion; pandas probably does this
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
86 |
|
43
bcf9ec537bda
read from stdin if no files specified
Jeff Hammel <k0scist@gmail.com>
parents:
30
diff
changeset
|
87 |
|
5
d5447d401c44
serializaion; pandas probably does this
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
class CSVParser(argparse.ArgumentParser):
    """CLI option parser that can also read and post-process the CSV input"""

    def __init__(self, **kwargs):
        """Set up the command line arguments.

        `kwargs` are passed on to `argparse.ArgumentParser`; the
        description defaults to this module's docstring.
        """
        kwargs.setdefault('description', __doc__)
        argparse.ArgumentParser.__init__(self, **kwargs)
        self.add_argument('csv', nargs='*',
                          help="CSV files to read, or read from stdin")
        self.add_argument('-+', '--add', dest='added_columns', nargs='+',
                          help="append this column")
        self.add_argument('-c', '--col', '--columns', dest='columns',
                          nargs='+', type=int,
                          help="column numbers to output, starting with 0")
        self.add_argument('-o', '--output', dest='output',
                          type=argparse.FileType('a'), default=sys.stdout,
                          help='output destination, or stdout')
        self.add_argument('--index', dest='index',
                          action='store_true', default=False,
                          help="prepend each row with numeric index")

        # set by `parse_args`; `read` relies on it
        self.options = None

    def parse_args(self, *args, **kw):
        """Parse and validate the arguments, remember them, and return them."""
        options = argparse.ArgumentParser.parse_args(self, *args, **kw)
        self.validate(options)
        self.options = options
        return options

    def validate(self, options):
        """validate options"""

    def read(self):
        """Read the CSV sources in `self.options.csv` and process the rows.

        `parse_args` must have been called first so that `self.options`
        is populated.  Returns the processed rows as a list of lists.
        """
        return self._process(read_csv(*self.options.csv))

    def _process(self, data):
        """Apply the --add, --columns and --index options to `data`.

        `data` is a list of row lists.  The steps run in that order, so
        column numbers given to --columns may refer to appended columns
        and the index is never filtered out.  Rows may be modified in
        place.
        """
        options = self.options

        if options.added_columns:
            # add columns
            for row in data:
                row.extend(options.added_columns)

        if options.columns:
            # filter by column
            data = [[row[column] for column in options.columns]
                    for row in data]

        if options.index:
            # prepend numeric index
            for index, row in enumerate(data):
                row.insert(0, index)

        # return processed data
        return data
|
ef915968d104
put this in the parser so that i can use this in convert
Jeff Hammel <k0scist@gmail.com>
parents:
43
diff
changeset
|
140 |
|
ef915968d104
put this in the parser so that i can use this in convert
Jeff Hammel <k0scist@gmail.com>
parents:
43
diff
changeset
|
141 |
|
5
d5447d401c44
serializaion; pandas probably does this
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
def main(args=sys.argv[1:]):
    """Command line entry point: read CSV input and write the processed rows."""

    # parse command line options
    parser = CSVParser()
    options = parser.parse_args(args)

    # with no CSV files named, fall back to reading stdin; `parser.read`
    # sees this because `parse_args` stored this same options object
    options.csv = options.csv or [sys.stdin]

    # read and process the CSV, then write it to the chosen output
    rows = parser.read()
    CSVWriter(options.output).write(rows)
| 159 | |
| 11 | 160 |
|
5
d5447d401c44
serializaion; pandas probably does this
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
161 if __name__ == '__main__': |
|
d5447d401c44
serializaion; pandas probably does this
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
162 main() |
|
d5447d401c44
serializaion; pandas probably does this
Jeff Hammel <k0scist@gmail.com>
parents:
diff
changeset
|
163 |
