>>> obj = pd.Series(np.arange(4.), index=['a', 'b', 'c', 'd']) >>> obj a 0.0 b 1.0 c 2.0 d 3.0 dtype: float64 >>> obj['b'] 1.0 >>> obj[1] 1.0 >>> obj[:2] a 0.0 b 1.0 dtype: float64 >>> obj[obj < 2] a 0.0 b 1.0 dtype: float64
# 如果使用 label 索引,则索引区间为闭区间 >>> obj['b':'c'] b 1.0 c 2.0 dtype: float64
5. DataFrame索引
先贴张DataFrame索引方法的表格,摘录自《Python for Data Analysis》。
Type
Notes
df[val]
Select single column or sequence of columns from the DataFrame; special case conveniences: boolean array (filter rows), slice (slice rows), or boolean DataFrame (set values bases on some criterion)
df.loc[val]
Selects single row or subset of rows from the DataFrame by label
df.loc[:, val]
Selects single column or subset of columns by label
df.loc[val1, val2]
Select both rows and columns by label
df.iloc[where]
Selects single row of subsets of rows from the DataFrame by integer position
df.iloc[:, where]
Selects single column or subset of columns by integer position
df.iloc[where_i, where_j]
Select both rows and columns by integer position
df.at[label_i, label_j]
Select a single scalar value by row and column label
df.iat[i, j]
Select a single scalar value by row and column position (integers)
先创建一个数据
1 2 3 4 5 6 7 8 9
>>> data = pd.DataFrame(np.arange(16).reshape((4, 4)), ... index=['Ohio', 'Colorado', 'Utah', 'New York'], ... columns=['one', 'two', 'three', 'four']) >>> data one two three four Ohio 0 1 2 3 Colorado 4 5 6 7 Utah 8 9 10 11 New York 12 13 14 15
# 利用单个label选择单列 >>> data['two'] Ohio 1 Colorado 5 Utah 9 New York 13 Name: two, dtype: int64
# 利用多个label选择多列,可以改变列顺序 >>> data[['three', 'one']] three one Ohio 2 0 Colorado 6 4 Utah 10 8 New York 14 12
# 利用boolean数组选择多行 >>> bools = np.array([False, True, False, True]) >>> bools array([False, True, False, True]) >>> data[bools] one two three four Colorado 4 5 6 7 New York 12 13 14 15
# 利用切片(slice)选择多行,类似Numpy的语法 >>> data[:2] one two three four Ohio 0 1 2 3 Colorado 4 5 6 7
# 利用boolean DataFrame选择数据 >>> data < 5 one two three four Ohio True True True True Colorado True False False False Utah False False False False New York False False False False >>> data[data < 5] = 0 >>> data one two three four Ohio 0 0 0 0 Colorado 0 5 6 7 Utah 8 9 10 11 New York 12 13 14 15
# 使用 df.loc[val] 选择行 >>> data.loc['Utah'] one 8 two 9 three 10 four 11 Name: Utah, dtype: int64 >>> data.loc[['Utah', 'Ohio']] one two three four Utah 8 9 10 11 Ohio 0 1 2 3 # 如果使用 label 索引,则索引区间为闭区间 >>> data.loc[:'Utah'] one two three four Ohio 0 1 2 3 Colorado 4 5 6 7 Utah 8 9 10 11
# 使用 df.loc[:, val] 选择列 >>> data.loc[:, 'one'] Ohio 0 Colorado 4 Utah 8 New York 12 Name: one, dtype: int64 >>> data.loc[:, ['one', 'two']] one two Ohio 0 1 Colorado 4 5 Utah 8 9 New York 12 13 # 如果使用 label 索引,则索引区间为闭区间 >>> data.loc[:, :'two'] one two Ohio 0 1 Colorado 4 5 Utah 8 9 New York 12 13
# 使用 df.loc[val1, val2] 选择多行多列 >>> data.loc[['Colorado', 'Ohio'], ['two', 'three']] two three Colorado 5 6 Ohio 1 2
# 使用 df.iloc[where] 选择行 >>> data.iloc[2] one 8 two 9 three 10 four 11 Name: Utah, dtype: int64 >>> data.iloc[[2,1]] one two three four Utah 8 9 10 11 Colorado 4 5 6 7 >>> data.iloc[:2] one two three four Ohio 0 1 2 3 Colorado 4 5 6 7
# 使用 df.iloc[:, where] 选择列 >>> data.iloc[:, 1] Ohio 1 Colorado 5 Utah 9 New York 13 Name: two, dtype: int64 >>> data.iloc[:, [2, 0]] three one Ohio 2 0 Colorado 6 4 Utah 10 8 New York 14 12 >>> data.iloc[:, :2] one two Ohio 0 1 Colorado 4 5 Utah 8 9 New York 12 13
# 使用 df.iloc[where_i, where_j] 选择多行多列 >>> data.iloc[2, :2] one 8 two 9 Name: Utah, dtype: int64 >>> data.iloc[:2, :2] one two Ohio 0 1 Colorado 4 5
>>> df = pd.DataFrame({"name": ['Alfred', 'Batman', 'Catwoman'], ... "toy": [np.nan, 'Batmobile', 'Bullwhip'], ... "born": [pd.NaT, pd.Timestamp("1940-04-25"), pd.NaT]}) >>> df name toy born 0 Alfred NaN NaT 1 Batman Batmobile 1940-04-25 2 Catwoman Bullwhip NaT >>> df.dropna() name toy born 1 Batman Batmobile 1940-04-25
删除有缺失值的列
设定dropna函数的参数axis为1可以删除有缺失值的列
1 2 3 4 5 6 7 8 9 10 11 12 13
>>> df = pd.DataFrame({"name": ['Alfred', 'Batman', 'Catwoman'], ... "toy": [np.nan, 'Batmobile', 'Bullwhip'], ... "born": [pd.NaT, pd.Timestamp("1940-04-25"), pd.NaT]}) >>> df name toy born 0 Alfred NaN NaT 1 Batman Batmobile 1940-04-25 2 Catwoman Bullwhip NaT >>> df.dropna(axis=1) name 0 Alfred 1 Batman 2 Catwoman
8. 重复值
已知DataFrame
1 2 3 4 5 6 7 8 9 10 11
>>> data = pd.DataFrame({'k1': ['one', 'two'] * 3 + ['two'], ... 'k2': [1, 1, 2, 3, 3, 4, 4]}) >>> data k1 k2 0 one 1 1 two 1 2 one 2 3 two 3 4 one 3 5 two 4 6 two 4
8.1. 删除重复值
可以使用函数drop_duplicates()删除重复值
1 2 3 4 5 6 7 8
>>> data.drop_duplicates() k1 k2 0 one 1 1 two 1 2 one 2 3 two 3 4 one 3 5 two 4
函数drop_duplicates()默认考虑的是全部列,也可以设定某些列来判断是否重复
1 2 3 4
>>> data.drop_duplicates(['k1']) k1 k2 0 one 1 1 two 1
>>> df.sort_values(by='a') a b 0 0 4 2 0 -3 1 1 7 3 1 2 >>> df.sort_values(by=['a', 'b']) a b 2 0 -3 0 0 4 3 1 2 1 1 7
10. GroupBy
已知DataFrame
1 2 3 4 5 6 7 8 9 10 11
>>> df = pd.DataFrame({'key1':['a','a','b','b','a'], ... 'key2':['one','two','one','two','one'], ... 'data1':np.random.randn(5), ... 'data2':np.random.randn(5)}) >>> df data1 data2 key1 key2 0 2.462027 0.054159 a one 1 0.283423 -0.658160 a two 2 -0.969307 -0.407126 b one 3 -0.636756 1.925338 b two 4 -0.408266 1.833710 a one
>>> df1 = pd.DataFrame({'A': ['a', 'b', 'c']}) >>> df1 A 0 a 1 b 2 c >>> df2 = pd.DataFrame({'B': [0, 1, 2]}) >>> df2 B 0 0 1 1 2 2 >>> pd.merge(df1.assign(foo=0), df2.assign(foo=0), on=['foo']).drop(columns=['foo']) A B 0 a 0 1 a 1 2 a 2 3 b 0 4 b 1 5 b 2 6 c 0 7 c 1 8 c 2