Pandas¶

Cheat Sheet

import pandas as pd

# Small demo frame used throughout: one integer column, two float columns
# and one column of Chinese strings.
df = pd.DataFrame(
    dict(
        a=[4, 4, 6, 5],
        b=[7.0, 8.0, 9.0, 15.0],
        c=[10.0, 11.0, 12.0, 20.0],
        text=['我', '今天', '沒', '吃飽'],
    )
)

df

	a	b	c	text
0	4	7.0	10.0	我
1	4	8.0	11.0	今天
2	6	9.0	12.0	沒
3	5	15.0	20.0	吃飽

Basic Data Frame Manipulation¶

Sort Values¶

df.sort_values('a')
df.sort_values('a',ascending=False)

Columns¶

df.drop(columns=['a','c'])
df[list_of_COLNAMES]: df[['a','b','c']]
- returns df
df.select_dtypes(include=['float64'])
df.select_dtypes(exclude=['float64'])
df[COLNAME]: df['a']
- returns series

Rows¶

df.head(), df.tail()
df.iloc[:3]
df[df["a"] > 4], df[~(df["a"] > 4)], df[df['a'].isin({5, 6})]
df.drop_duplicates()
df.sample(frac=0.5), df.sample(n=100)

Rows + Columns¶

df.iloc[:3, [0, 2]]
df.iloc[:3][['a', 'b']]

Cell Value¶

df.at[row_label, colname]:

df.at[0, 'c']
assign: df.at[0, 'new_text'] = '今天星期一'

Text Processing¶

df[df["text"].str.match('..')]

	a	b	c	text
1	4	8.0	11.0	今天
3	5	15.0	20.0	吃飽

df[df["text"].isin({'我', '沒'})]

	a	b	c	text
0	4	7.0	10.0	我
2	6	9.0	12.0	沒

df.at[0, 'new_text'] = '今天星期一'
df

	a	b	c	text	new_text
0	4	7.0	10.0	我	今天星期一
1	4	8.0	11.0	今天	NaN
2	6	9.0	12.0	沒	NaN
3	5	15.0	20.0	吃飽	NaN

Iteration¶

# Walk the frame row by row; iterrows() yields (index label, row as a Series).
for i, row in df.iterrows():
    print(i)
    # The row Series is labelled by column name, so integer keys such as
    # row[0] only work through a deprecated positional fallback. Use .iloc
    # for explicit positional access to the first four cells.
    for pos, value in enumerate(row.iloc[:4]):
        print(f"Cell {pos}: {value}")
    print()

0
Cell 0: 4
Cell 1: 7.0
Cell 2: 10.0
Cell 3: 我

1
Cell 0: 4
Cell 1: 8.0
Cell 2: 11.0
Cell 3: 今天

2
Cell 0: 6
Cell 1: 9.0
Cell 2: 12.0
Cell 3: 沒

3
Cell 0: 5
Cell 1: 15.0
Cell 2: 20.0
Cell 3: 吃飽

# Weekday characters, one per list element; looked up by row label below.
day = list('日一二三四五六')

# Only the index labels are needed, so iterate df.index instead of
# iterrows(): this avoids building an unused Series per row and avoids
# writing to the frame while iterrows() is still walking it.
for i in df.index:
    df.at[i, 'new_text'] = f"今天星期{day[i]}"

df

	a	b	c	text	new_text
0	4	7.0	10.0	我	今天星期日
1	4	8.0	11.0	今天	今天星期一
2	6	9.0	12.0	沒	今天星期二
3	5	15.0	20.0	吃飽	今天星期三

for col in df:
    print(f"Colname: {col}")

Colname: a
Colname: b
Colname: c
Colname: text
Colname: new_text

Plotting¶

Setting up `matplotlib` display options in Jupyter notebook¶

import matplotlib.pyplot as plt

# Notebook-wide matplotlib defaults; every plot created after this cell picks them up.
plt.rcParams['figure.figsize'] = (8, 5)  # default figure size (width, height)
# NOTE(review): 'AR PL KaitiM Big5' is presumably chosen so the Chinese labels render;
# it has to be installed locally — confirm it is available on the target machine.
plt.rcParams['font.family'] = ['AR PL KaitiM Big5']  # Custom font (installed on computer, .ttf format)
plt.rcParams['font.sans-serif'] = ['DejaVu Sans']    # Font list used when the generic 'sans-serif' family is requested
plt.rcParams.update({'font.size': 18})  # base font size for text in figures
# NOTE(review): presumably disabled because the CJK font lacks the Unicode minus glyph — confirm.
plt.rcParams['axes.unicode_minus'] = False

df['a'].plot.hist()

<matplotlib.axes._subplots.AxesSubplot at 0x7faf0318ac50>

df[['a', 'b']].plot.hist(alpha=0.5)

<matplotlib.axes._subplots.AxesSubplot at 0x7faf00c39710>

df[['a', 'b']].plot.hist(subplots=True, layout=(1,2), figsize=(10, 4))

array([[<matplotlib.axes._subplots.AxesSubplot object at 0x7faf010cb150>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7faf00b31ed0>]],
      dtype=object)

df.plot.bar(x='text', y='b', rot=55) 

<matplotlib.axes._subplots.AxesSubplot at 0x7faf00a59f10>

Miscellaneous¶

# Restrict the correlation to numeric columns: df also holds the string
# columns 'text' and 'new_text', which recent pandas versions no longer
# drop silently in corr().
df.corr(numeric_only=True)

	a	b	c
a	1.000000	0.363270	0.323531
b	0.363270	1.000000	0.998765
c	0.323531	0.998765	1.000000

df.select_dtypes(exclude=['float64'])

	a	text	new_text
0	4	我	今天星期日
1	4	今天	今天星期一
2	6	沒	今天星期二
3	5	吃飽	今天星期三

df.select_dtypes(include=['float64']).to_numpy()

array([[ 7., 10.],
       [ 8., 11.],
       [ 9., 12.],
       [15., 20.]])

Python Notes

Pandas¶

Basic Data Frame Manipulation¶

Sort Values¶

Columns¶

Rows¶

Rows + Columns¶

Cell Value¶

Text Processing¶

Iteration¶

Plotting¶

Setting up matplotlib display options in Jupyter notebook¶

Miscellaneous¶

Setting up `matplotlib` display options in Jupyter notebook¶