If your input is in csv-format, you could use the csv module.
The main point by grouping is, that you have to sort it before.
The functions
sorted and
itertools.groupby have key functions, which are called.
A helper function is
operator.itemgetter to get items by index or name.
from operator import itemgetter data = [(1,10), (2, 0), (3, -10), (4, -20)] print(sorted(data, key=itemgetter(1))) # sort by second item
Output:
[(4, -20), (3, -10), (2, 0), (1, 10)]
Now let's do it with data for grouping:
from operator import itemgetter from itertools import groupby data = [ (10,20,'foo'), (10,30,'bar'), (10,10,'fizz'), (1,3,'bar'), (1,3,'foo'), ] getter = itemgetter(2) # getter for third field sorted_list = sorted(data, key=getter) print(sorted_list)
Output:
[(10, 30, 'bar'), (1, 3, 'bar'), (10, 10, 'fizz'), (10, 20, 'foo'), (1, 3, 'foo')]
Next step is the grouping:
from operator import itemgetter from itertools import groupby data = [ (10,20,'foo'), (10,30,'bar'), (10,10,'fizz'), (1,3,'bar'), (1,3,'foo'), ] getter = itemgetter(2) # getter for third field sorted_list = sorted(data, key=getter) grouped = groupby(sorted_list, getter) for group, elements in grouped: print('Group ->', group) for element in elements: print(element)Output:
Group -> bar (10, 30, 'bar') (1, 3, 'bar') Group -> fizz (10, 10, 'fizz') Group -> foo (10, 20, 'foo') (1, 3, 'foo')
If you don't sort, following happens:
from operator import itemgetter from itertools import groupby data = [ (10,20,'foo'), (10,30,'bar'), (10,10,'fizz'), (1,3,'bar'), (1,3,'foo'), ] getter = itemgetter(2) # getter for third field grouped = groupby(data, getter) for group, elements in grouped: print('Group ->', group) for element in elements: print(element)Output:
Group -> foo (10, 20, 'foo') Group -> bar (10, 30, 'bar') Group -> fizz (10, 10, 'fizz') Group -> bar (1, 3, 'bar') Group -> foo (1, 3, 'foo')
Now the whole implementation with some tricks:
#!/usr/bin/env python3 """ Group a csv file by a given field. """ import sys from argparse import ArgumentParser from itertools import groupby from operator import itemgetter from pathlib import Path from csv import reader, writer def group_csv_reader(file, fields): with file.open() as fd: csv = reader(fd) header = next(csv) for field in fields: if field not in header: raise ValueError(f'Only following fields are valid to group: {header}') indicies = [header.index(field) for field in fields] getter = itemgetter(*indicies) # the itemgetter can get one or more items from a sequence or mapping yield header csv = sorted(csv, key=getter) # remove sorted, to see the effect # this will consome the whole csv file # and sort it. After the sorting # the whole content of the file is in memory # if the file is smaller than 1GiB, # it's ok not to be memory efficient # now the file is closed and # csv still contains the sorted rows by field csv = groupby(csv, key=getter) # groupby itself does not sort # it just yield the group and rows for group, rows in csv: for row in rows: yield row def main(): parser = ArgumentParser(description=__doc__) parser.add_argument('file', type=Path, help='CSV input file to process') # file is a Path object you can work with parser.add_argument('fields', nargs='+', help='Fieldname(s) to group by') parser.add_argument('-o', type=Path, help='Output to a file') # o is a boolean # one or more fieldname args = parser.parse_args() if args.o: fout = args.o.open('w') else: fout = sys.stdout # now make a csv_writer, wich uses currently sys.stdout csv_writer = writer(fout) # catching the error, if a field was # given which does not exist try: iterator = group_csv_reader(args.file, args.fields) except ValueError as e: print(e, file=sys.stderr) fout.close() return 1 for row in iterator: csv_writer.writerow(row) return 0 if __name__ == '__main__': sys.exit(main())