# Using pandas
import pandas as pd
import numpy as np # for generating random numbers
## Series
# series
s1 = pd.Series([10, 11, 12], dtype = float)
s1
0 10.0
1 11.0
2 12.0
dtype: float64
s2 = pd.Series([10, 11, 12], dtype = int)
s2
0 10
1 11
2 12
dtype: int64
s3 = pd.Series(["10", "11", "12"], name = 'Val', index = pd.Index(['a', 'b', 'c'], name = 'label'))
s3
label
a 10
b 11
c 12
Name: Val, dtype: object
s1.index
RangeIndex(start=0, stop=3, step=1)
# Series.append is not available in pandas 2.x; pd.concat is used instead.
# s1.append(s2)
# Joining the float Series s1 with the string Series s3 gives an object-dtype
# Series whose index mixes the integer labels 0..2 with the labels 'a'..'c'.
s4 = pd.concat([s1, s3])
s4
0 10.0
1 11.0
2 12.0
a 10
b 11
c 12
dtype: object
# 0 is one of s4's index labels, so this is a label lookup.
print(s4.loc[0])
print(s4)
# Throw away the mixed labels and renumber the rows 0..5.
s4_reset = s4.reset_index(drop=True)
print(s4_reset)
s4_reset.loc[0]
10.0
0 10.0
1 11.0
2 12.0
a 10
b 11
c 12
dtype: object
0 10.0
1 11.0
2 12.0
3 10
4 11
5 12
dtype: object
10.0
dates = pd.date_range('20190503', periods=7)
dates
DatetimeIndex(['2019-05-03', '2019-05-04', '2019-05-05', '2019-05-06',
'2019-05-07', '2019-05-08', '2019-05-09'],
dtype='datetime64[ns]', freq='D')
## DataFrame
df = pd.DataFrame(np.random.randn(7, 4), index=dates, columns=list('ABCD'))
df
A | B | C | D | |
---|---|---|---|---|
2019-05-03 | 1.106142 | -1.411186 | -0.664336 | -0.998144 |
2019-05-04 | -0.219846 | -0.562399 | 0.882633 | -0.183028 |
2019-05-05 | 0.130602 | -0.559537 | -0.683457 | 1.393433 |
2019-05-06 | 2.234397 | 1.709079 | 0.610843 | 0.119211 |
2019-05-07 | 0.447079 | -0.928171 | 1.102700 | 0.010732 |
2019-05-08 | -0.406803 | 1.557577 | -1.035966 | -0.112492 |
2019-05-09 | -0.767276 | -0.932870 | -1.022550 | 0.422754 |
# Build a frame from a mapping of column name -> values. Scalars (A, B, F)
# are repeated for every row; the row labels 2..5 come from the Series in C.
df2 = pd.DataFrame(
    dict(
        A=1.0,
        B=pd.Timestamp('20130102'),
        C=pd.Series(1, index=[2, 3, 4, 5], dtype='float32'),
        D=np.full(4, 3, dtype='int32'),
        E=pd.Categorical(["test", "train"] * 2),
        F='foo',
    )
)
df2
A | B | C | D | E | F | |
---|---|---|---|---|---|---|
2 | 1.0 | 2013-01-02 | 1.0 | 3 | test | foo |
3 | 1.0 | 2013-01-02 | 1.0 | 3 | train | foo |
4 | 1.0 | 2013-01-02 | 1.0 | 3 | test | foo |
5 | 1.0 | 2013-01-02 | 1.0 | 3 | train | foo |
df2.dtypes
A float64
B datetime64[s]
C float32
D int32
E category
F object
dtype: object
df = pd.read_csv('heart.csv')
df
Unnamed: 0 | Age | Sex | ChestPain | RestBP | Chol | Fbs | RestECG | MaxHR | ExAng | Oldpeak | Slope | Ca | Thal | AHD | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 63 | 1 | typical | 145 | 233 | 1 | 2 | 150 | 0 | 2.3 | 3 | 0.0 | fixed | No |
1 | 2 | 67 | 1 | asymptomatic | 160 | 286 | 0 | 2 | 108 | 1 | 1.5 | 2 | 3.0 | normal | Yes |
2 | 3 | 67 | 1 | asymptomatic | 120 | 229 | 0 | 2 | 129 | 1 | 2.6 | 2 | 2.0 | reversable | Yes |
3 | 4 | 37 | 1 | nonanginal | 130 | 250 | 0 | 0 | 187 | 0 | 3.5 | 3 | 0.0 | normal | No |
4 | 5 | 41 | 0 | nontypical | 130 | 204 | 0 | 2 | 172 | 0 | 1.4 | 1 | 0.0 | normal | No |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
298 | 299 | 45 | 1 | typical | 110 | 264 | 0 | 0 | 132 | 0 | 1.2 | 2 | 0.0 | reversable | Yes |
299 | 300 | 68 | 1 | asymptomatic | 144 | 193 | 1 | 0 | 141 | 0 | 3.4 | 2 | 2.0 | reversable | Yes |
300 | 301 | 57 | 1 | asymptomatic | 130 | 131 | 0 | 0 | 115 | 1 | 1.2 | 2 | 1.0 | reversable | Yes |
301 | 302 | 57 | 0 | nontypical | 130 | 236 | 0 | 2 | 174 | 0 | 0.0 | 2 | 1.0 | normal | Yes |
302 | 303 | 38 | 1 | nonanginal | 138 | 175 | 0 | 0 | 173 | 0 | 0.0 | 1 | NaN | normal | No |
303 rows × 15 columns
## Viewing Data
df.head(10)
Unnamed: 0 | Age | Sex | ChestPain | RestBP | Chol | Fbs | RestECG | MaxHR | ExAng | Oldpeak | Slope | Ca | Thal | AHD | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 63 | 1 | typical | 145 | 233 | 1 | 2 | 150 | 0 | 2.3 | 3 | 0.0 | fixed | No |
1 | 2 | 67 | 1 | asymptomatic | 160 | 286 | 0 | 2 | 108 | 1 | 1.5 | 2 | 3.0 | normal | Yes |
2 | 3 | 67 | 1 | asymptomatic | 120 | 229 | 0 | 2 | 129 | 1 | 2.6 | 2 | 2.0 | reversable | Yes |
3 | 4 | 37 | 1 | nonanginal | 130 | 250 | 0 | 0 | 187 | 0 | 3.5 | 3 | 0.0 | normal | No |
4 | 5 | 41 | 0 | nontypical | 130 | 204 | 0 | 2 | 172 | 0 | 1.4 | 1 | 0.0 | normal | No |
5 | 6 | 56 | 1 | nontypical | 120 | 236 | 0 | 0 | 178 | 0 | 0.8 | 1 | 0.0 | normal | No |
6 | 7 | 62 | 0 | asymptomatic | 140 | 268 | 0 | 2 | 160 | 0 | 3.6 | 3 | 2.0 | normal | Yes |
7 | 8 | 57 | 0 | asymptomatic | 120 | 354 | 0 | 0 | 163 | 1 | 0.6 | 1 | 0.0 | normal | No |
8 | 9 | 63 | 1 | asymptomatic | 130 | 254 | 0 | 2 | 147 | 0 | 1.4 | 2 | 1.0 | reversable | Yes |
9 | 10 | 53 | 1 | asymptomatic | 140 | 203 | 1 | 2 | 155 | 1 | 3.1 | 3 | 0.0 | reversable | Yes |
df.tail(8)
Unnamed: 0 | Age | Sex | ChestPain | RestBP | Chol | Fbs | RestECG | MaxHR | ExAng | Oldpeak | Slope | Ca | Thal | AHD | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
295 | 296 | 41 | 1 | nontypical | 120 | 157 | 0 | 0 | 182 | 0 | 0.0 | 1 | 0.0 | normal | No |
296 | 297 | 59 | 1 | asymptomatic | 164 | 176 | 1 | 2 | 90 | 0 | 1.0 | 2 | 2.0 | fixed | Yes |
297 | 298 | 57 | 0 | asymptomatic | 140 | 241 | 0 | 0 | 123 | 1 | 0.2 | 2 | 0.0 | reversable | Yes |
298 | 299 | 45 | 1 | typical | 110 | 264 | 0 | 0 | 132 | 0 | 1.2 | 2 | 0.0 | reversable | Yes |
299 | 300 | 68 | 1 | asymptomatic | 144 | 193 | 1 | 0 | 141 | 0 | 3.4 | 2 | 2.0 | reversable | Yes |
300 | 301 | 57 | 1 | asymptomatic | 130 | 131 | 0 | 0 | 115 | 1 | 1.2 | 2 | 1.0 | reversable | Yes |
301 | 302 | 57 | 0 | nontypical | 130 | 236 | 0 | 2 | 174 | 0 | 0.0 | 2 | 1.0 | normal | Yes |
302 | 303 | 38 | 1 | nonanginal | 138 | 175 | 0 | 0 | 173 | 0 | 0.0 | 1 | NaN | normal | No |
df.describe()
Unnamed: 0 | Age | Sex | RestBP | Chol | Fbs | RestECG | MaxHR | ExAng | Oldpeak | Slope | Ca | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 303.000000 | 303.000000 | 303.000000 | 303.000000 | 303.000000 | 303.000000 | 303.000000 | 303.000000 | 303.000000 | 303.000000 | 303.000000 | 299.000000 |
mean | 152.000000 | 54.438944 | 0.679868 | 131.689769 | 246.693069 | 0.148515 | 0.990099 | 149.607261 | 0.326733 | 1.039604 | 1.600660 | 0.672241 |
std | 87.612784 | 9.038662 | 0.467299 | 17.599748 | 51.776918 | 0.356198 | 0.994971 | 22.875003 | 0.469794 | 1.161075 | 0.616226 | 0.937438 |
min | 1.000000 | 29.000000 | 0.000000 | 94.000000 | 126.000000 | 0.000000 | 0.000000 | 71.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 |
25% | 76.500000 | 48.000000 | 0.000000 | 120.000000 | 211.000000 | 0.000000 | 0.000000 | 133.500000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 |
50% | 152.000000 | 56.000000 | 1.000000 | 130.000000 | 241.000000 | 0.000000 | 1.000000 | 153.000000 | 0.000000 | 0.800000 | 2.000000 | 0.000000 |
75% | 227.500000 | 61.000000 | 1.000000 | 140.000000 | 275.000000 | 0.000000 | 2.000000 | 166.000000 | 1.000000 | 1.600000 | 2.000000 | 1.000000 |
max | 303.000000 | 77.000000 | 1.000000 | 200.000000 | 564.000000 | 1.000000 | 2.000000 | 202.000000 | 1.000000 | 6.200000 | 3.000000 | 3.000000 |
df.index
RangeIndex(start=0, stop=303, step=1)
df.columns
Index(['Unnamed: 0', 'Age', 'Sex', 'ChestPain', 'RestBP', 'Chol', 'Fbs',
'RestECG', 'MaxHR', 'ExAng', 'Oldpeak', 'Slope', 'Ca', 'Thal', 'AHD'],
dtype='object')
# sort_index orders by labels: axis=0 uses the row labels, axis=1 the column
# names. Each call returns a new frame; df itself is not modified.
# df.sort_index(axis=0, ascending=True)
# df.sort_index(axis=0, ascending=False)
# The ascending result on the next line is not assigned, and only the last
# expression of a notebook cell is displayed, so only the descending
# column order is shown.
df.sort_index(axis=1, ascending=True)
df.sort_index(axis=1, ascending=False)
Unnamed: 0 | Thal | Slope | Sex | RestECG | RestBP | Oldpeak | MaxHR | Fbs | ExAng | Chol | ChestPain | Ca | Age | AHD | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | fixed | 3 | 1 | 2 | 145 | 2.3 | 150 | 1 | 0 | 233 | typical | 0.0 | 63 | No |
1 | 2 | normal | 2 | 1 | 2 | 160 | 1.5 | 108 | 0 | 1 | 286 | asymptomatic | 3.0 | 67 | Yes |
2 | 3 | reversable | 2 | 1 | 2 | 120 | 2.6 | 129 | 0 | 1 | 229 | asymptomatic | 2.0 | 67 | Yes |
3 | 4 | normal | 3 | 1 | 0 | 130 | 3.5 | 187 | 0 | 0 | 250 | nonanginal | 0.0 | 37 | No |
4 | 5 | normal | 1 | 0 | 2 | 130 | 1.4 | 172 | 0 | 0 | 204 | nontypical | 0.0 | 41 | No |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
298 | 299 | reversable | 2 | 1 | 0 | 110 | 1.2 | 132 | 0 | 0 | 264 | typical | 0.0 | 45 | Yes |
299 | 300 | reversable | 2 | 1 | 0 | 144 | 3.4 | 141 | 1 | 0 | 193 | asymptomatic | 2.0 | 68 | Yes |
300 | 301 | reversable | 2 | 1 | 0 | 130 | 1.2 | 115 | 0 | 1 | 131 | asymptomatic | 1.0 | 57 | Yes |
301 | 302 | normal | 2 | 0 | 2 | 130 | 0.0 | 174 | 0 | 0 | 236 | nontypical | 1.0 | 57 | Yes |
302 | 303 | normal | 1 | 1 | 0 | 138 | 0.0 | 173 | 0 | 0 | 175 | nonanginal | NaN | 38 | No |
303 rows × 15 columns
df.sort_values(by = 'Sex', ascending = False)
Unnamed: 0 | Age | Sex | ChestPain | RestBP | Chol | Fbs | RestECG | MaxHR | ExAng | Oldpeak | Slope | Ca | Thal | AHD | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
302 | 303 | 38 | 1 | nonanginal | 138 | 175 | 0 | 0 | 173 | 0 | 0.0 | 1 | NaN | normal | No |
0 | 1 | 63 | 1 | typical | 145 | 233 | 1 | 2 | 150 | 0 | 2.3 | 3 | 0.0 | fixed | No |
1 | 2 | 67 | 1 | asymptomatic | 160 | 286 | 0 | 2 | 108 | 1 | 1.5 | 2 | 3.0 | normal | Yes |
2 | 3 | 67 | 1 | asymptomatic | 120 | 229 | 0 | 2 | 129 | 1 | 2.6 | 2 | 2.0 | reversable | Yes |
3 | 4 | 37 | 1 | nonanginal | 130 | 250 | 0 | 0 | 187 | 0 | 3.5 | 3 | 0.0 | normal | No |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
26 | 27 | 58 | 0 | nonanginal | 120 | 340 | 0 | 0 | 172 | 0 | 0.0 | 1 | 0.0 | normal | No |
27 | 28 | 66 | 0 | typical | 150 | 226 | 0 | 0 | 114 | 0 | 2.6 | 3 | 0.0 | normal | No |
30 | 31 | 69 | 0 | typical | 140 | 239 | 0 | 0 | 151 | 0 | 1.8 | 1 | 2.0 | normal | No |
40 | 41 | 65 | 0 | asymptomatic | 150 | 225 | 0 | 2 | 114 | 0 | 1.0 | 2 | 3.0 | reversable | Yes |
42 | 43 | 71 | 0 | nontypical | 160 | 302 | 0 | 0 | 162 | 0 | 0.4 | 1 | 2.0 | normal | No |
303 rows × 15 columns
df.T
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 293 | 294 | 295 | 296 | 297 | 298 | 299 | 300 | 301 | 302 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Unnamed: 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | ... | 294 | 295 | 296 | 297 | 298 | 299 | 300 | 301 | 302 | 303 |
Age | 63 | 67 | 67 | 37 | 41 | 56 | 62 | 57 | 63 | 53 | ... | 63 | 63 | 41 | 59 | 57 | 45 | 68 | 57 | 57 | 38 |
Sex | 1 | 1 | 1 | 1 | 0 | 1 | 0 | 0 | 1 | 1 | ... | 1 | 0 | 1 | 1 | 0 | 1 | 1 | 1 | 0 | 1 |
ChestPain | typical | asymptomatic | asymptomatic | nonanginal | nontypical | nontypical | asymptomatic | asymptomatic | asymptomatic | asymptomatic | ... | asymptomatic | asymptomatic | nontypical | asymptomatic | asymptomatic | typical | asymptomatic | asymptomatic | nontypical | nonanginal |
RestBP | 145 | 160 | 120 | 130 | 130 | 120 | 140 | 120 | 130 | 140 | ... | 140 | 124 | 120 | 164 | 140 | 110 | 144 | 130 | 130 | 138 |
Chol | 233 | 286 | 229 | 250 | 204 | 236 | 268 | 354 | 254 | 203 | ... | 187 | 197 | 157 | 176 | 241 | 264 | 193 | 131 | 236 | 175 |
Fbs | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 |
RestECG | 2 | 2 | 2 | 0 | 2 | 0 | 2 | 0 | 2 | 2 | ... | 2 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 2 | 0 |
MaxHR | 150 | 108 | 129 | 187 | 172 | 178 | 160 | 163 | 147 | 155 | ... | 144 | 136 | 182 | 90 | 123 | 132 | 141 | 115 | 174 | 173 |
ExAng | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | ... | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 |
Oldpeak | 2.3 | 1.5 | 2.6 | 3.5 | 1.4 | 0.8 | 3.6 | 0.6 | 1.4 | 3.1 | ... | 4.0 | 0.0 | 0.0 | 1.0 | 0.2 | 1.2 | 3.4 | 1.2 | 0.0 | 0.0 |
Slope | 3 | 2 | 2 | 3 | 1 | 1 | 3 | 1 | 2 | 3 | ... | 1 | 2 | 1 | 2 | 2 | 2 | 2 | 2 | 2 | 1 |
Ca | 0.0 | 3.0 | 2.0 | 0.0 | 0.0 | 0.0 | 2.0 | 0.0 | 1.0 | 0.0 | ... | 2.0 | 0.0 | 0.0 | 2.0 | 0.0 | 0.0 | 2.0 | 1.0 | 1.0 | NaN |
Thal | fixed | normal | reversable | normal | normal | normal | normal | normal | reversable | reversable | ... | reversable | normal | normal | fixed | reversable | reversable | reversable | reversable | normal | normal |
AHD | No | Yes | Yes | No | No | No | Yes | No | Yes | Yes | ... | Yes | Yes | No | Yes | Yes | Yes | Yes | Yes | Yes | No |
15 rows × 303 columns
df['Age']
0 63
1 67
2 67
3 37
4 41
..
298 45
299 68
300 57
301 57
302 38
Name: Age, Length: 303, dtype: int64
df[['Age', 'Sex']]
Age | Sex | |
---|---|---|
0 | 63 | 1 |
1 | 67 | 1 |
2 | 67 | 1 |
3 | 37 | 1 |
4 | 41 | 0 |
... | ... | ... |
298 | 45 | 1 |
299 | 68 | 1 |
300 | 57 | 1 |
301 | 57 | 0 |
302 | 38 | 1 |
303 rows × 2 columns
df[['Age', 'Sex']][0:2]
Age | Sex | |
---|---|---|
0 | 63 | 1 |
1 | 67 | 1 |
df[0:2][['Age']]
Age | |
---|---|
0 | 63 |
1 | 67 |
## Data Selection
df.loc[4]
# df
Unnamed: 0 5
Age 41
Sex 0
ChestPain nontypical
RestBP 130
Chol 204
Fbs 0
RestECG 2
MaxHR 172
ExAng 0
Oldpeak 1.4
Slope 1
Ca 0.0
Thal normal
AHD No
Name: 4, dtype: object
df.loc[[3,5], ['Age', 'Fbs']]
Age | Fbs | |
---|---|---|
3 | 37 | 0 |
5 | 56 | 0 |
df.iloc[5]
Unnamed: 0 6
Age 56
Sex 1
ChestPain nontypical
RestBP 120
Chol 236
Fbs 0
RestECG 0
MaxHR 178
ExAng 0
Oldpeak 0.8
Slope 1
Ca 0.0
Thal normal
AHD No
Name: 5, dtype: object
df.iloc[5, 3]
'nontypical'
# iloc selects purely by integer position; the end of a slice is exclusive.
# df.iloc[5:7, 3:7]
# df.iloc[[5, 7, 9], 3:7]
# df.iloc[:, 3]
# Rows at positions 3 and 4, all columns.
df.iloc[3: 5, :]
# NOTE(review): .loc is label-based and the column labels here are strings,
# so the integer column slice 3:7 presumably fails -- confirm before enabling.
# df.loc[5:7, 3:7]
Unnamed: 0 | Age | Sex | ChestPain | RestBP | Chol | Fbs | RestECG | MaxHR | ExAng | Oldpeak | Slope | Ca | Thal | AHD | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
3 | 4 | 37 | 1 | nonanginal | 130 | 250 | 0 | 0 | 187 | 0 | 3.5 | 3 | 0.0 | normal | No |
4 | 5 | 41 | 0 | nontypical | 130 | 204 | 0 | 2 | 172 | 0 | 1.4 | 1 | 0.0 | normal | No |
df.iloc[1, 1]
df.at[1, 'Age'] # Get scalar values. It's a very fast loc
df.iat[1, 1] # Get scalar values. It's a very fast iloc
67
df['Age'] > 50
0 True
1 True
2 True
3 False
4 False
...
298 False
299 True
300 True
301 True
302 False
Name: Age, Length: 303, dtype: bool
df[df.Age > 50]
Unnamed: 0 | Age | Sex | ChestPain | RestBP | Chol | Fbs | RestECG | MaxHR | ExAng | Oldpeak | Slope | Ca | Thal | AHD | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 63 | 1 | typical | 145 | 233 | 1 | 2 | 150 | 0 | 2.3 | 3 | 0.0 | fixed | No |
1 | 2 | 67 | 1 | asymptomatic | 160 | 286 | 0 | 2 | 108 | 1 | 1.5 | 2 | 3.0 | normal | Yes |
2 | 3 | 67 | 1 | asymptomatic | 120 | 229 | 0 | 2 | 129 | 1 | 2.6 | 2 | 2.0 | reversable | Yes |
5 | 6 | 56 | 1 | nontypical | 120 | 236 | 0 | 0 | 178 | 0 | 0.8 | 1 | 0.0 | normal | No |
6 | 7 | 62 | 0 | asymptomatic | 140 | 268 | 0 | 2 | 160 | 0 | 3.6 | 3 | 2.0 | normal | Yes |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
296 | 297 | 59 | 1 | asymptomatic | 164 | 176 | 1 | 2 | 90 | 0 | 1.0 | 2 | 2.0 | fixed | Yes |
297 | 298 | 57 | 0 | asymptomatic | 140 | 241 | 0 | 0 | 123 | 1 | 0.2 | 2 | 0.0 | reversable | Yes |
299 | 300 | 68 | 1 | asymptomatic | 144 | 193 | 1 | 0 | 141 | 0 | 3.4 | 2 | 2.0 | reversable | Yes |
300 | 301 | 57 | 1 | asymptomatic | 130 | 131 | 0 | 0 | 115 | 1 | 1.2 | 2 | 1.0 | reversable | Yes |
301 | 302 | 57 | 0 | nontypical | 130 | 236 | 0 | 2 | 174 | 0 | 0.0 | 2 | 1.0 | normal | Yes |
209 rows × 15 columns
# Slice first, then copy: only the five selected rows are duplicated, and
# df2 is an explicitly independent frame, so adding columns to it can neither
# modify df nor trigger chained-assignment warnings.
df2 = df.iloc[0:5, :].copy()
df2
Unnamed: 0 | Age | Sex | ChestPain | RestBP | Chol | Fbs | RestECG | MaxHR | ExAng | Oldpeak | Slope | Ca | Thal | AHD | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 63 | 1 | typical | 145 | 233 | 1 | 2 | 150 | 0 | 2.3 | 3 | 0.0 | fixed | No |
1 | 2 | 67 | 1 | asymptomatic | 160 | 286 | 0 | 2 | 108 | 1 | 1.5 | 2 | 3.0 | normal | Yes |
2 | 3 | 67 | 1 | asymptomatic | 120 | 229 | 0 | 2 | 129 | 1 | 2.6 | 2 | 2.0 | reversable | Yes |
3 | 4 | 37 | 1 | nonanginal | 130 | 250 | 0 | 0 | 187 | 0 | 3.5 | 3 | 0.0 | normal | No |
4 | 5 | 41 | 0 | nontypical | 130 | 204 | 0 | 2 | 172 | 0 | 1.4 | 1 | 0.0 | normal | No |
# df2['A'] = list(range(len(df2.index))) # use this form to create a new column
# Assigning a list with one entry per row adds the column 'cp'.
df2['cp'] = ['three', 'two', 'one', 'one', 'zero']
df2
# Boolean mask: True for rows whose 'cp' value is in the given list.
df2.cp.isin(['one', 'zero'])
0 False
1 False
2 True
3 True
4 True
Name: cp, dtype: bool
df2[df2.cp.isin(['one', 'zero'])]
Unnamed: 0 | Age | Sex | ChestPain | RestBP | Chol | Fbs | RestECG | MaxHR | ExAng | Oldpeak | Slope | Ca | Thal | AHD | cp | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
2 | 3 | 67 | 1 | asymptomatic | 120 | 229 | 0 | 2 | 129 | 1 | 2.6 | 2 | 2.0 | reversable | Yes | one |
3 | 4 | 37 | 1 | nonanginal | 130 | 250 | 0 | 0 | 187 | 0 | 3.5 | 3 | 0.0 | normal | No | one |
4 | 5 | 41 | 0 | nontypical | 130 | 204 | 0 | 2 | 172 | 0 | 1.4 | 1 | 0.0 | normal | No | zero |
## Missing Data
# Keep rows 0..3 and add a column 'E'; reindexing fills the new column
# with NaN because df has no such column.
df1 = df.reindex(index=range(4), columns=[*df.columns, 'E'])
# .loc slices by label and includes both ends, so rows 2 and 3 are set.
df1.loc[2:3, 'E'] = 1
df1
Unnamed: 0 | Age | Sex | ChestPain | RestBP | Chol | Fbs | RestECG | MaxHR | ExAng | Oldpeak | Slope | Ca | Thal | AHD | E | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 63 | 1 | typical | 145 | 233 | 1 | 2 | 150 | 0 | 2.3 | 3 | 0.0 | fixed | No | NaN |
1 | 2 | 67 | 1 | asymptomatic | 160 | 286 | 0 | 2 | 108 | 1 | 1.5 | 2 | 3.0 | normal | Yes | NaN |
2 | 3 | 67 | 1 | asymptomatic | 120 | 229 | 0 | 2 | 129 | 1 | 2.6 | 2 | 2.0 | reversable | Yes | 1.0 |
3 | 4 | 37 | 1 | nonanginal | 130 | 250 | 0 | 0 | 187 | 0 | 3.5 | 3 | 0.0 | normal | No | 1.0 |
df1.dropna(how='any')
Unnamed: 0 | Age | Sex | ChestPain | RestBP | Chol | Fbs | RestECG | MaxHR | ExAng | Oldpeak | Slope | Ca | Thal | AHD | E | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
2 | 3 | 67 | 1 | asymptomatic | 120 | 229 | 0 | 2 | 129 | 1 | 2.6 | 2 | 2.0 | reversable | Yes | 1.0 |
3 | 4 | 37 | 1 | nonanginal | 130 | 250 | 0 | 0 | 187 | 0 | 3.5 | 3 | 0.0 | normal | No | 1.0 |
pd.isna(df1)
Unnamed: 0 | Age | Sex | ChestPain | RestBP | Chol | Fbs | RestECG | MaxHR | ExAng | Oldpeak | Slope | Ca | Thal | AHD | E | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | True |
1 | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | True |
2 | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False |
3 | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False | False |
df1.fillna(value=2)
Unnamed: 0 | Age | Sex | ChestPain | RestBP | Chol | Fbs | RestECG | MaxHR | ExAng | Oldpeak | Slope | Ca | Thal | AHD | E | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 63 | 1 | typical | 145 | 233 | 1 | 2 | 150 | 0 | 2.3 | 3 | 0.0 | fixed | No | 2.0 |
1 | 2 | 67 | 1 | asymptomatic | 160 | 286 | 0 | 2 | 108 | 1 | 1.5 | 2 | 3.0 | normal | Yes | 2.0 |
2 | 3 | 67 | 1 | asymptomatic | 120 | 229 | 0 | 2 | 129 | 1 | 2.6 | 2 | 2.0 | reversable | Yes | 1.0 |
3 | 4 | 37 | 1 | nonanginal | 130 | 250 | 0 | 0 | 187 | 0 | 3.5 | 3 | 0.0 | normal | No | 1.0 |
## Operations
# numeric_only=True restricts the mean to the numeric columns. df1 also holds
# string columns (ChestPain, Thal, AHD); with the pandas 2.x default
# (numeric_only=False) a bare df1.mean() raises
# "TypeError: Could not convert [...] to numeric".
df1.mean(numeric_only=True)
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
Cell In[40], line 1
----> 1 df1.mean()
File ~/.virtualenvs/.venv/lib/python3.11/site-packages/pandas/core/frame.py:11335, in DataFrame.mean(self, axis, skipna, numeric_only, **kwargs)
11327 @doc(make_doc("mean", ndim=2))
11328 def mean(
11329 self,
(...)
11333 **kwargs,
11334 ):
> 11335 result = super().mean(axis, skipna, numeric_only, **kwargs)
11336 if isinstance(result, Series):
11337 result = result.__finalize__(self, method="mean")
File ~/.virtualenvs/.venv/lib/python3.11/site-packages/pandas/core/generic.py:11984, in NDFrame.mean(self, axis, skipna, numeric_only, **kwargs)
11977 def mean(
11978 self,
11979 axis: Axis | None = 0,
(...)
11982 **kwargs,
11983 ) -> Series | float:
> 11984 return self._stat_function(
11985 "mean", nanops.nanmean, axis, skipna, numeric_only, **kwargs
11986 )
File ~/.virtualenvs/.venv/lib/python3.11/site-packages/pandas/core/generic.py:11941, in NDFrame._stat_function(self, name, func, axis, skipna, numeric_only, **kwargs)
11937 nv.validate_func(name, (), kwargs)
11939 validate_bool_kwarg(skipna, "skipna", none_allowed=False)
> 11941 return self._reduce(
11942 func, name=name, axis=axis, skipna=skipna, numeric_only=numeric_only
11943 )
File ~/.virtualenvs/.venv/lib/python3.11/site-packages/pandas/core/frame.py:11204, in DataFrame._reduce(self, op, name, axis, skipna, numeric_only, filter_type, **kwds)
11200 df = df.T
11202 # After possibly _get_data and transposing, we are now in the
11203 # simple case where we can use BlockManager.reduce
> 11204 res = df._mgr.reduce(blk_func)
11205 out = df._constructor_from_mgr(res, axes=res.axes).iloc[0]
11206 if out_dtype is not None and out.dtype != "boolean":
File ~/.virtualenvs/.venv/lib/python3.11/site-packages/pandas/core/internals/managers.py:1459, in BlockManager.reduce(self, func)
1457 res_blocks: list[Block] = []
1458 for blk in self.blocks:
-> 1459 nbs = blk.reduce(func)
1460 res_blocks.extend(nbs)
1462 index = Index([None]) # placeholder
File ~/.virtualenvs/.venv/lib/python3.11/site-packages/pandas/core/internals/blocks.py:377, in Block.reduce(self, func)
371 @final
372 def reduce(self, func) -> list[Block]:
373 # We will apply the function and reshape the result into a single-row
374 # Block with the same mgr_locs; squeezing will be done at a higher level
375 assert self.ndim == 2
--> 377 result = func(self.values)
379 if self.values.ndim == 1:
380 res_values = result
File ~/.virtualenvs/.venv/lib/python3.11/site-packages/pandas/core/frame.py:11136, in DataFrame._reduce.<locals>.blk_func(values, axis)
11134 return np.array([result])
11135 else:
> 11136 return op(values, axis=axis, skipna=skipna, **kwds)
File ~/.virtualenvs/.venv/lib/python3.11/site-packages/pandas/core/nanops.py:147, in bottleneck_switch.__call__.<locals>.f(values, axis, skipna, **kwds)
145 result = alt(values, axis=axis, skipna=skipna, **kwds)
146 else:
--> 147 result = alt(values, axis=axis, skipna=skipna, **kwds)
149 return result
File ~/.virtualenvs/.venv/lib/python3.11/site-packages/pandas/core/nanops.py:404, in _datetimelike_compat.<locals>.new_func(values, axis, skipna, mask, **kwargs)
401 if datetimelike and mask is None:
402 mask = isna(values)
--> 404 result = func(values, axis=axis, skipna=skipna, mask=mask, **kwargs)
406 if datetimelike:
407 result = _wrap_results(result, orig_values.dtype, fill_value=iNaT)
File ~/.virtualenvs/.venv/lib/python3.11/site-packages/pandas/core/nanops.py:720, in nanmean(values, axis, skipna, mask)
718 count = _get_counts(values.shape, mask, axis, dtype=dtype_count)
719 the_sum = values.sum(axis, dtype=dtype_sum)
--> 720 the_sum = _ensure_numeric(the_sum)
722 if axis is not None and getattr(the_sum, "ndim", False):
723 count = cast(np.ndarray, count)
File ~/.virtualenvs/.venv/lib/python3.11/site-packages/pandas/core/nanops.py:1678, in _ensure_numeric(x)
1675 inferred = lib.infer_dtype(x)
1676 if inferred in ["string", "mixed"]:
1677 # GH#44008, GH#36703 avoid casting e.g. strings to numeric
-> 1678 raise TypeError(f"Could not convert {x} to numeric")
1679 try:
1680 x = x.astype(np.complex128)
TypeError: Could not convert ['typicalasymptomaticasymptomaticnonanginal' 'fixednormalreversablenormal'
'NoYesYesNo'] to numeric
# Row-wise mean. numeric_only=True is required because df1 mixes numeric and
# string columns; without it pandas 2.x raises TypeError here as well.
df1.mean(axis=1, numeric_only=True)
# Element-wise arithmetic against one value per row (axis='index' aligns the
# list with the rows). The list length must equal the number of rows.
# df1.sub([10] * len(df1), axis = 'index')
df1.mul([10] * len(df1), axis='index')
df1
# apply() calls the function once per column by default.
df1.apply(np.cumsum)
# Series.apply calls the function once per element.
df1.Age.apply(lambda x: x*365.25)
# Ten random integers drawn from 0..6 (upper bound 7 is exclusive).
s = pd.Series(np.random.randint(0, 7, size=10))
s
# Frequency of each distinct value.
s.value_counts()
s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'])
s
# Vectorised string method via the .str accessor; the NaN entry stays NaN.
s.str.lower()
## Merge
# Ten rows of random normal values, cut into three consecutive row chunks.
df2 = pd.DataFrame(np.random.randn(10, 4))
pieces = [df2.iloc[:3], df2.iloc[3:7], df2.iloc[7:]]
pieces
# Glue the chunks back together in reverse chunk order; every row keeps its
# original label.
pd.concat(pieces[::-1])
# Both frames repeat the key 'foo', so the join pairs every left row with
# every right row that has the same key.
left = pd.DataFrame(dict(key=['foo', 'foo'], lval=[1, 2]))
right = pd.DataFrame(dict(key=['foo', 'foo'], rval=[4, 5]))
left
right
left.merge(right, on='key')
# With unique keys each left row matches exactly one right row.
left = pd.DataFrame(dict(key=['foo', 'bar'], lval=[1, 2]))
right = pd.DataFrame(dict(key=['foo', 'bar'], rval=[4, 5]))
left.merge(right, on='key')
left
right
df = pd.DataFrame(np.random.randn(8, 4), columns=['A', 'B', 'C', 'D'])
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way
# to add rows. iloc[[3]] (list indexer) keeps row 3 as a one-row DataFrame so
# it is stacked as a row rather than as a column.
# pd.concat([df, df.iloc[[3]]])  # keeps the duplicated row label 3
# ignore_index=True renumbers the result 0..8.
df_appended = pd.concat([df, df.iloc[[3]]], ignore_index=True)
df_appended
## Grouping
# Two string key columns (A, B) and two random numeric columns (C, D).
df = pd.DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
                         'foo', 'bar', 'foo', 'foo'],
                   'B': ['one', 'one', 'two', 'three',
                         'two', 'two', 'one', 'three'],
                   'C': np.random.randn(8),
                   'D': np.random.randn(8)})
df
# Select the numeric columns explicitly: on pandas 2.x, sum() over the string
# column B would concatenate its values instead of leaving the column out.
df.groupby('A')[['C', 'D']].sum()
# Grouping by both string columns leaves only C and D to aggregate.
df.groupby(['A', 'B']).sum()
## Plotting
# Pick column C before aggregating so only that column is summed; summing the
# whole frame first would also aggregate the string column B for nothing.
df.groupby('A')['C'].sum().plot(kind='bar')
# Ten rows of uniform random values in four columns, one bar group per row.
df2 = pd.DataFrame(np.random.rand(10, 4), columns=['a', 'b', 'c', 'd'])
df2.plot.bar()
# The trailing semicolon suppresses the text repr of the returned Axes in a
# notebook, leaving only the figure.
df2.plot.bar(stacked=True);
# Same stacked chart drawn with horizontal bars.
df2.plot.barh(stacked=True)
## File I/O
# Load heart.csv; the path is relative, so the file must be in the working
# directory.
df = pd.read_csv('heart.csv')
df.head()
# Write the frame back to CSV; index=False leaves the row index out of the file.
# df.to_csv('heart_copy.csv', index = False)
# Split the rows into three chunks and write each chunk to its own sheet of a
# single workbook. Sheet3 is written without index=False, so it keeps the
# row index as a column.
# df1 = df[:10]
# df2 = df[10:20]
# df3 = df[20:40]
# with pd.ExcelWriter('path_to_file.xlsx') as writer:
#     df1.to_excel(writer, sheet_name='Sheet1', index = False)
#     df2.to_excel(writer, sheet_name='Sheet2', index = False)
#     df3.to_excel(writer, sheet_name='Sheet3')