>>> obj = pd.Series(np.arange(4.), index=['a', 'b', 'c', 'd']) >>> obj a 0.0 b 1.0 c 2.0 d 3.0 dtype: float64 >>> obj['b'] 1.0 >>> obj[1] 1.0 >>> obj[:2] a 0.0 b 1.0 dtype: float64 >>> obj[obj < 2] a 0.0 b 1.0 dtype: float64
# 如果使用 label 索引,则索引区间为闭区间 >>> obj['b':'c'] b 1.0 c 2.0 dtype: float64
5. DataFrame索引
先贴张DataFrame索引方法的表格,摘录自《Python for Data Analysis》。
Type
Notes
df[val]
Select single column or sequence of columns from the DataFrame; special case conveniences: boolean array (filter rows), slice (slice rows), or boolean DataFrame (set values bases on some criterion)
df.loc[val]
Selects single row or subset of rows from the DataFrame by label
df.loc[:, val]
Selects single column or subset of columns by label
df.loc[val1, val2]
Select both rows and columns by label
df.iloc[where]
Selects single row of subsets of rows from the DataFrame by integer position
df.iloc[:, where]
Selects single column or subset of columns by integer position
df.iloc[where_i, where_j]
Select both rows and columns by integer position
df.at[label_i, label_j]
Select a single scalar value by row and column label
df.iat[i, j]
Select a single scalar value by row and column position (integers)
先创建一个数据
1 2 3 4 5 6 7 8 9
>>> data = pd.DataFrame(np.arange(16).reshape((4, 4)), ... index=['Ohio', 'Colorado', 'Utah', 'New York'], ... columns=['one', 'two', 'three', 'four']) >>> data one two three four Ohio 0 1 2 3 Colorado 4 5 6 7 Utah 8 9 10 11 New York 12 13 14 15
# 利用单个label选择单列 >>> data['two'] Ohio 1 Colorado 5 Utah 9 New York 13 Name: two, dtype: int64
# 利用多个label选择多列,可以改变列顺序 >>> data[['three', 'one']] three one Ohio 2 0 Colorado 6 4 Utah 10 8 New York 14 12
# 利用boolean数组选择多行 >>> bools = np.array([False, True, False, True]) >>> bools array([False, True, False, True]) >>> data[bools] one two three four Colorado 4 5 6 7 New York 12 13 14 15
# 利用切片(slice)选择多行,类似Numpy的语法 >>> data[:2] one two three four Ohio 0 1 2 3 Colorado 4 5 6 7
# 利用boolean DataFrame选择数据 >>> data < 5 one two three four Ohio True True True True Colorado True False False False Utah False False False False New York False False False False >>> data[data < 5] = 0 >>> data one two three four Ohio 0 0 0 0 Colorado 0 5 6 7 Utah 8 9 10 11 New York 12 13 14 15
# 使用 df.loc[val] 选择行 >>> data.loc['Utah'] one 8 two 9 three 10 four 11 Name: Utah, dtype: int64 >>> data.loc[['Utah', 'Ohio']] one two three four Utah 8 9 10 11 Ohio 0 1 2 3 # 如果使用 label 索引,则索引区间为闭区间 >>> data.loc[:'Utah'] one two three four Ohio 0 1 2 3 Colorado 4 5 6 7 Utah 8 9 10 11
# 使用 df.loc[:, val] 选择列 >>> data.loc[:, 'one'] Ohio 0 Colorado 4 Utah 8 New York 12 Name: one, dtype: int64 >>> data.loc[:, ['one', 'two']] one two Ohio 0 1 Colorado 4 5 Utah 8 9 New York 12 13 # 如果使用 label 索引,则索引区间为闭区间 >>> data.loc[:, :'two'] one two Ohio 0 1 Colorado 4 5 Utah 8 9 New York 12 13
# 使用 df.loc[val1, val2] 选择多行多列 >>> data.loc[['Colorado', 'Ohio'], ['two', 'three']] two three Colorado 5 6 Ohio 1 2
# 使用 df.iloc[where] 选择行 >>> data.iloc[2] one 8 two 9 three 10 four 11 Name: Utah, dtype: int64 >>> data.iloc[[2,1]] one two three four Utah 8 9 10 11 Colorado 4 5 6 7 >>> data.iloc[:2] one two three four Ohio 0 1 2 3 Colorado 4 5 6 7
# 使用 df.iloc[:, where] 选择列 >>> data.iloc[:, 1] Ohio 1 Colorado 5 Utah 9 New York 13 Name: two, dtype: int64 >>> data.iloc[:, [2, 0]] three one Ohio 2 0 Colorado 6 4 Utah 10 8 New York 14 12 >>> data.iloc[:, :2] one two Ohio 0 1 Colorado 4 5 Utah 8 9 New York 12 13
# 使用 df.iloc[where_i, where_j] 选择多行多列 >>> data.iloc[2, :2] one 8 two 9 Name: Utah, dtype: int64 >>> data.iloc[:2, :2] one two Ohio 0 1 Colorado 4 5
>>> df = pd.DataFrame({"name": ['Alfred', 'Batman', 'Catwoman'], ... "toy": [np.nan, 'Batmobile', 'Bullwhip'], ... "born": [pd.NaT, pd.Timestamp("1940-04-25"), pd.NaT]}) >>> df name toy born 0 Alfred NaN NaT 1 Batman Batmobile 1940-04-25 2 Catwoman Bullwhip NaT >>> df.dropna() name toy born 1 Batman Batmobile 1940-04-25
删除有缺失值的列
设定dropna函数的参数axis为1可以删除有缺失值的列
1 2 3 4 5 6 7 8 9 10 11 12 13
>>> df = pd.DataFrame({"name": ['Alfred', 'Batman', 'Catwoman'], ... "toy": [np.nan, 'Batmobile', 'Bullwhip'], ... "born": [pd.NaT, pd.Timestamp("1940-04-25"), pd.NaT]}) >>> df name toy born 0 Alfred NaN NaT 1 Batman Batmobile 1940-04-25 2 Catwoman Bullwhip NaT >>> df.dropna(axis=1) name 0 Alfred 1 Batman 2 Catwoman
8. 重复值
已知DataFrame
1 2 3 4 5 6 7 8 9 10 11
>>> data = pd.DataFrame({'k1': ['one', 'two'] * 3 + ['two'], ... 'k2': [1, 1, 2, 3, 3, 4, 4]}) >>> data k1 k2 0 one 1 1 two 1 2 one 2 3 two 3 4 one 3 5 two 4 6 two 4
8.1. 删除重复值
可以使用函数drop_duplicates()删除重复值
1 2 3 4 5 6 7 8
>>> data.drop_duplicates() k1 k2 0 one 1 1 two 1 2 one 2 3 two 3 4 one 3 5 two 4
函数drop_duplicates()默认考虑的是全部列,也可以设定某些列来判断是否重复
1 2 3 4
>>> data.drop_duplicates(['k1']) k1 k2 0 one 1 1 two 1
>>> df.sort_values(by='a') a b 0 0 4 2 0 -3 1 1 7 3 1 2 >>> df.sort_values(by=['a', 'b']) a b 2 0 -3 0 0 4 3 1 2 1 1 7
10. GroupBy
已知DataFrame
1 2 3 4 5 6 7 8 9 10 11
>>> df = pd.DataFrame({'key1':['a','a','b','b','a'], ... 'key2':['one','two','one','two','one'], ... 'data1':np.random.randn(5), ... 'data2':np.random.randn(5)}) >>> df data1 data2 key1 key2 0 2.462027 0.054159 a one 1 0.283423 -0.658160 a two 2 -0.969307 -0.407126 b one 3 -0.636756 1.925338 b two 4 -0.408266 1.833710 a one
>>> df1 = pd.DataFrame({'A': ['a', 'b', 'c']}) >>> df1 A 0 a 1 b 2 c >>> df2 = pd.DataFrame({'B': [0, 1, 2]}) >>> df2 B 0 0 1 1 2 2 >>> pd.merge(df1.assign(foo=0), df2.assign(foo=0), on=['foo']).drop(columns=['foo']) A B 0 a 0 1 a 1 2 a 2 3 b 0 4 b 1 5 b 2 6 c 0 7 c 1 8 c 2
import os import requests from retrying import retry from io import BytesIO from PIL import Image import progressbar import concurrent.futures as concurrent
i = 0 total = len(image_urls) pbar = progressbar.ProgressBar(max_value=total).start() for task in concurrent.as_completed(tasks): url, file_path = tasks[task] try: task.result() i = i + 1 pbar.update(i) except Exception as exc: print('{} generated an exception: {}'.format(url, exc)) pbar.finish()
4. 遍历文件夹中所有文件
首先目录结构如下:
1 2 3 4 5 6 7
$ tree test test ├── 1.txt ├── 2.txt └── test2 ├── 3.txt └── 4.txt
使用os.walk()遍历test目录,代码如下:
1 2 3 4 5 6
root_dir = '/tmp/test' for root, dirs, files in os.walk(root_dir, topdown=True): for name in files: print(os.path.join(root, name)) for name in dirs: print(os.path.join(root, name))
deftimethis(func): """ Decorator that reports the execution time """ @wraps(func) defwrapper(*args, **kwargs): start = time.time() result = func(*args, **kwargs) end = time.time() print(func.__name__, end-start)
if __name__ == "__main__": a = [1, 2] b = 3 print(f"'a' is {'iterable'if isiterable(a) else'not iterable'}") print(f"'b' is {'iterable'if isiterable(b) else'not iterable'}")
目标检测算法定位该汽车的位置时找出了一堆检测框,此时我们需要去除冗余的检测框。假设目标算法找到了6个检测框,而且算法还提供了每个框中内容属于汽车的概率或者得分 (在RCNN中,使用SVM计算检测框属于该类别的得分),NMS方法首先根据得分大小对检测框进行排序,假设从小到大的排序为 A, B, C, D, E, F。
从最大得分的检测框 F 开始,分别判断 A~E 与 F 的重叠度IoU是否大于某个设定的阈值。
假设检测框 B、D 与 F 的重叠度超过阈值,那么就抛弃 B、D,并将检测框 F 标记为要保留的检测框。
第2步去掉 B 和 D 后,剩余检测框 A、C、E,接着在剩下的检测框中选择得分最大检测框 E,然后判断 E 和 A、C 的重叠度,如果重叠度大于设定阈值,那就抛弃该检测框,否则留到下一轮的筛选过程,并将检测框 E 标记为要保留的检测框。
重复步骤 3,直到剩余待筛选的框个数为 0。
实现
关于单类别的NMS的实现,网上已经有实现好的,这里贴出 Ross Girshick (RCNN提出者) 写的Python实现 py_cpu_nms.py,并且加了个人的标注。