I have many large (>35,000,000) lists of integers that will contain duplicates. I need to get a count for each integer in a list. The following code works, but seems slow. Can anyone else better the benchmark using Python and preferably NumPy?
def group():
import numpy as np
from itertools import groupby
values = np.array(np.random.randint(0,1<<32, size=35000000), dtype='u4')
values.sort()
groups = ((k, len(list(g))) for k,g in groupby(values))
index = np.fromiter(groups, dtype='u4,u2')
if __name__=='__main__':
from timeit import Timer
t = Timer("group()","from __main__ import group")
print t.timeit(number=1)
which returns:
$ python bench.py
111.377498865
Based on responses:
def group_original():
import numpy as np
from itertools import groupby
values = np.array(np.random.randint(0, 1<<32, size=35000000), dtype='u4')
values.sort()
groups = ((k, len(list(g))) for k,g in groupby(values))
index = np.fromiter(groups, dtype='u4,u2')
def group_gnibbler():
import numpy as np
from itertools import groupby
values = np.array(np.random.randint(0, 1<<32, size=35000000), dtype='u4')
values.sort()
groups = ((k,sum(1 for i in g)) for k,g in groupby(values))
index = np.fromiter(groups, dtype='u4,u2')
def group_christophe():
import numpy as np
values = np.array(np.random.randint(0, 1<<32, size=35000000), dtype='u4')
values.sort()
counts=values.searchsorted(values, side='right') - values.searchsorted(values, side='left')
index = np.zeros(len(values), dtype='u4,u2')
index['f0'] = values
index['f1'] = counts
# Erroneous result!
def group_paul():
import numpy as np
values = np.array(np.random.randint(0, 1<<32, size=35000000), dtype='u4')
values.sort()
diff = np.concatenate(([1], np.diff(values)))
idx = np.concatenate((np.where(diff)[0], [len(values)]))
index = np.empty(len(idx)-1, dtype='u4,u2')
index['f0'] = values[idx[:-1]]
index['f1'] = np.diff(idx)
if __name__=='__main__':
from timeit import Timer
timings=[
("group_original", "Original"),
("group_gnibbler", "Gnibbler"),
("group_christophe", "Christophe"),
("group_paul", "Paul"),
]
for method,title in timings:
t = Timer("%s()"%method,"from __main__ import %s"%method)
print "%s: %s secs"%(title, t.timeit(number=1))
which returns:
$ python bench.py
Original: 113.385262966 secs
Gnibbler: 71.7464978695 secs
Christophe: 27.1690568924 secs
Paul: 9.06268405914 secs
Although Christophe gives incorrect results currently.
See Question&Answers more detail:os