Python collections中的Counter

Counter 是 dict 的子类,是一个用来统计元素出现次数的简单计数器。

1
2
3
4
5
from collections import Counter

# Tally how many times each character occurs in the word 'programming'.
c = Counter()
for ch in 'programming':
    # A Counter yields 0 for a key it has not seen yet, so the increment
    # needs no membership test.
    c[ch] += 1
# Resulting tally (the display order of equal counts can differ between
# Python versions):
# Counter({'r': 2, 'g': 2, 'm': 2, 'p': 1, 'o': 1, 'a': 1, 'i': 1, 'n': 1})
c

counter有多高效?
counter为什么这么高效?

速度测评

下面的脚本对语料中的单词进行计数并计时。结果发现,在该测试环境下 Counter 并没有在速度上做专门优化,并不比手写的 dict 计数更快。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from time import time
import tensorflow as tf
from collections import Counter


def _read_words(filename):
    """Read *filename* and return its whitespace-separated tokens.

    Every newline character is replaced by the marker ``<eos>`` before the
    text is split on whitespace.

    Args:
        filename: Path handed to ``tf.gfile.GFile``.

    Returns:
        A list of token strings.
    """
    with tf.gfile.GFile(filename, "r") as f:
        text = f.read()
    # read() may hand back bytes or already-decoded text depending on the
    # Python / TensorFlow version; calling .decode() on text that is already
    # str raises AttributeError on Python 3, so decode only when needed.
    if isinstance(text, bytes):
        text = text.decode("utf-8")
    return text.replace("\n", "<eos>").split()


# 计数 + 排序
def _build_vocab(filename):
    """Build a word -> integer id mapping from the tokens in *filename*.

    Words are ranked by descending frequency, with ties broken by ascending
    word order; a word's id is its zero-based rank. The time spent on the
    count-and-sort step is printed as a side effect.

    Args:
        filename: Path of the text file, passed on to ``_read_words``.

    Returns:
        A dict mapping each distinct word to its id. It is empty when the
        file yields no tokens.
    """
    data = _read_words(filename)

    start = time()
    # Timed section: count with Counter, then sort by (-count, word).
    # To time the hand-written counter instead, swap Counter(data) for
    # count(data); the sort stays the same.
    counter = Counter(data)
    count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0]))
    stop = time()

    print("Stop: " + str(stop))
    print(str(stop - start) + "秒")

    # Enumerating the sorted pairs assigns the same ids as zipping the words
    # with a range, but does not fail when there are no pairs to unpack.
    return {word: idx for idx, (word, _) in enumerate(count_pairs)}


def count(data):
    """Count how often each item occurs in *data* using a plain dict.

    This is the hand-written alternative to ``Counter(data)``.

    Args:
        data: An iterable of hashable items.

    Returns:
        A dict mapping each distinct item to its number of occurrences.
    """
    word_cnt = {}
    for w in data:
        # Explicit membership test kept on purpose: this function is the
        # baseline the Counter timing is compared against.
        if w in word_cnt:
            word_cnt[w] += 1
        else:
            word_cnt[w] = 1
    return word_cnt


# Build the vocabulary from ptb_data/ptb.train.txt when the module is run or
# imported, then print the id assigned to the word "no".
word_to_id = _build_vocab("ptb_data/ptb.train.txt")
print(word_to_id["no"])