Using pandas

import pandas as pd
import numpy as np # for generating random numbers

Series

# A Series built from integer literals but stored with a float dtype.
s1 = pd.Series(data=[10, 11, 12], dtype=float)
s1
0    10.0
1    11.0
2    12.0
dtype: float64
# A Series of three integers stored with an integer dtype.
s2 = pd.Series(data=[10, 11, 12], dtype=int)
s2
0    10
1    11
2    12
dtype: int64
# A named Series of strings whose index has explicit labels and its own name.
s3 = pd.Series(
    ["10", "11", "12"],
    index=pd.Index(["a", "b", "c"], name="label"),
    name="Val",
)
s3
label
a    10
b    11
c    12
Name: Val, dtype: object
s1.index
RangeIndex(start=0, stop=3, step=1)
# s1.append(s2)  # old spelling: Series.append was removed in pandas 2.0
# pd.concat stacks the Series end to end and keeps each one's index labels.
s4 = pd.concat([s1, s3])
s4
0    10.0
1    11.0
2    12.0
a      10
b      11
c      12
dtype: object
# Use .loc so the integer is unambiguously treated as an index *label*
# rather than a position; s4 still carries the labels 0, 1, 2, 'a', 'b', 'c'.
print(s4.loc[0])
print(s4)
# reset_index(drop=True) discards the old labels and renumbers from 0.
s4_reset = s4.reset_index(drop=True)
print(s4_reset)
s4_reset.loc[0]
10.0
0    10.0
1    11.0
2    12.0
a      10
b      11
c      12
dtype: object
0    10.0
1    11.0
2    12.0
3      10
4      11
5      12
dtype: object
10.0
# Seven consecutive timestamps starting on 2019-05-03 (default daily step).
dates = pd.date_range(start='20190503', periods=7)

dates
DatetimeIndex(['2019-05-03', '2019-05-04', '2019-05-05', '2019-05-06',
               '2019-05-07', '2019-05-08', '2019-05-09'],
              dtype='datetime64[ns]', freq='D')

DataFrame

df = pd.DataFrame(np.random.randn(7, 4), index=dates, columns=list('ABCD'))
df
A B C D
2019-05-03 -2.251887 -1.194490 -0.496259 0.128805
2019-05-04 0.468443 1.929640 0.320742 -0.590545
2019-05-05 -0.448269 -1.052435 1.344766 -0.734297
2019-05-06 -1.938237 0.043719 0.433391 1.262041
2019-05-07 -0.660655 -0.159853 -0.224375 0.013336
2019-05-08 0.933448 0.223172 -0.192180 0.139506
2019-05-09 1.406890 -0.051580 0.037574 -0.926195
# Build a DataFrame from a dict of columns. Scalars ('A', 'B', 'F') are
# repeated for every row; the row labels 2..5 come from the Series in 'C'.
df2 = pd.DataFrame(
    {
        'A': 1.,
        'B': pd.Timestamp('20130102'),
        'C': pd.Series(1, index=list(range(2, 6)), dtype='float32'),
        'D': np.full(4, 3, dtype='int32'),
        'E': pd.Categorical(["test", "train", "test", "train"]),
        'F': 'foo',
    }
)
df2
A B C D E F
2 1.0 2013-01-02 1.0 3 test foo
3 1.0 2013-01-02 1.0 3 train foo
4 1.0 2013-01-02 1.0 3 test foo
5 1.0 2013-01-02 1.0 3 train foo
df2.dtypes
A          float64
B    datetime64[s]
C          float32
D            int32
E         category
F           object
dtype: object
# Load heart.csv from the working directory. The 'Unnamed: 0' column in the
# output is the file's header-less first column (a 1-based row counter);
# NOTE(review): read_csv(..., index_col=0) would use it as the index instead.
df = pd.read_csv('heart.csv')
df
Unnamed: 0 Age Sex ChestPain RestBP Chol Fbs RestECG MaxHR ExAng Oldpeak Slope Ca Thal AHD
0 1 63 1 typical 145 233 1 2 150 0 2.3 3 0.0 fixed No
1 2 67 1 asymptomatic 160 286 0 2 108 1 1.5 2 3.0 normal Yes
2 3 67 1 asymptomatic 120 229 0 2 129 1 2.6 2 2.0 reversable Yes
3 4 37 1 nonanginal 130 250 0 0 187 0 3.5 3 0.0 normal No
4 5 41 0 nontypical 130 204 0 2 172 0 1.4 1 0.0 normal No
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
298 299 45 1 typical 110 264 0 0 132 0 1.2 2 0.0 reversable Yes
299 300 68 1 asymptomatic 144 193 1 0 141 0 3.4 2 2.0 reversable Yes
300 301 57 1 asymptomatic 130 131 0 0 115 1 1.2 2 1.0 reversable Yes
301 302 57 0 nontypical 130 236 0 2 174 0 0.0 2 1.0 normal Yes
302 303 38 1 nonanginal 138 175 0 0 173 0 0.0 1 NaN normal No

303 rows × 15 columns

Viewing Data

df.head(10)
Unnamed: 0 Age Sex ChestPain RestBP Chol Fbs RestECG MaxHR ExAng Oldpeak Slope Ca Thal AHD
0 1 63 1 typical 145 233 1 2 150 0 2.3 3 0.0 fixed No
1 2 67 1 asymptomatic 160 286 0 2 108 1 1.5 2 3.0 normal Yes
2 3 67 1 asymptomatic 120 229 0 2 129 1 2.6 2 2.0 reversable Yes
3 4 37 1 nonanginal 130 250 0 0 187 0 3.5 3 0.0 normal No
4 5 41 0 nontypical 130 204 0 2 172 0 1.4 1 0.0 normal No
5 6 56 1 nontypical 120 236 0 0 178 0 0.8 1 0.0 normal No
6 7 62 0 asymptomatic 140 268 0 2 160 0 3.6 3 2.0 normal Yes
7 8 57 0 asymptomatic 120 354 0 0 163 1 0.6 1 0.0 normal No
8 9 63 1 asymptomatic 130 254 0 2 147 0 1.4 2 1.0 reversable Yes
9 10 53 1 asymptomatic 140 203 1 2 155 1 3.1 3 0.0 reversable Yes
df.tail(8)
Unnamed: 0 Age Sex ChestPain RestBP Chol Fbs RestECG MaxHR ExAng Oldpeak Slope Ca Thal AHD
295 296 41 1 nontypical 120 157 0 0 182 0 0.0 1 0.0 normal No
296 297 59 1 asymptomatic 164 176 1 2 90 0 1.0 2 2.0 fixed Yes
297 298 57 0 asymptomatic 140 241 0 0 123 1 0.2 2 0.0 reversable Yes
298 299 45 1 typical 110 264 0 0 132 0 1.2 2 0.0 reversable Yes
299 300 68 1 asymptomatic 144 193 1 0 141 0 3.4 2 2.0 reversable Yes
300 301 57 1 asymptomatic 130 131 0 0 115 1 1.2 2 1.0 reversable Yes
301 302 57 0 nontypical 130 236 0 2 174 0 0.0 2 1.0 normal Yes
302 303 38 1 nonanginal 138 175 0 0 173 0 0.0 1 NaN normal No
df.describe()
Unnamed: 0 Age Sex RestBP Chol Fbs RestECG MaxHR ExAng Oldpeak Slope Ca
count 303.000000 303.000000 303.000000 303.000000 303.000000 303.000000 303.000000 303.000000 303.000000 303.000000 303.000000 299.000000
mean 152.000000 54.438944 0.679868 131.689769 246.693069 0.148515 0.990099 149.607261 0.326733 1.039604 1.600660 0.672241
std 87.612784 9.038662 0.467299 17.599748 51.776918 0.356198 0.994971 22.875003 0.469794 1.161075 0.616226 0.937438
min 1.000000 29.000000 0.000000 94.000000 126.000000 0.000000 0.000000 71.000000 0.000000 0.000000 1.000000 0.000000
25% 76.500000 48.000000 0.000000 120.000000 211.000000 0.000000 0.000000 133.500000 0.000000 0.000000 1.000000 0.000000
50% 152.000000 56.000000 1.000000 130.000000 241.000000 0.000000 1.000000 153.000000 0.000000 0.800000 2.000000 0.000000
75% 227.500000 61.000000 1.000000 140.000000 275.000000 0.000000 2.000000 166.000000 1.000000 1.600000 2.000000 1.000000
max 303.000000 77.000000 1.000000 200.000000 564.000000 1.000000 2.000000 202.000000 1.000000 6.200000 3.000000 3.000000
df.index
RangeIndex(start=0, stop=303, step=1)
df.columns
Index(['Unnamed: 0', 'Age', 'Sex', 'ChestPain', 'RestBP', 'Chol', 'Fbs',
       'RestECG', 'MaxHR', 'ExAng', 'Oldpeak', 'Slope', 'Ca', 'Thal', 'AHD'],
      dtype='object')
# Row-wise variants (axis=0 sorts by the row index):
# df.sort_index(axis=0, ascending=True)
# df.sort_index(axis=0, ascending=False)

# axis=1 sorts the columns by name. Only the result of the last expression is
# shown, so the table that follows is the descending sort.
df.sort_index(axis=1, ascending=True)
df.sort_index(axis=1, ascending=False)
Unnamed: 0 Thal Slope Sex RestECG RestBP Oldpeak MaxHR Fbs ExAng Chol ChestPain Ca Age AHD
0 1 fixed 3 1 2 145 2.3 150 1 0 233 typical 0.0 63 No
1 2 normal 2 1 2 160 1.5 108 0 1 286 asymptomatic 3.0 67 Yes
2 3 reversable 2 1 2 120 2.6 129 0 1 229 asymptomatic 2.0 67 Yes
3 4 normal 3 1 0 130 3.5 187 0 0 250 nonanginal 0.0 37 No
4 5 normal 1 0 2 130 1.4 172 0 0 204 nontypical 0.0 41 No
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
298 299 reversable 2 1 0 110 1.2 132 0 0 264 typical 0.0 45 Yes
299 300 reversable 2 1 0 144 3.4 141 1 0 193 asymptomatic 2.0 68 Yes
300 301 reversable 2 1 0 130 1.2 115 0 1 131 asymptomatic 1.0 57 Yes
301 302 normal 2 0 2 130 0.0 174 0 0 236 nontypical 1.0 57 Yes
302 303 normal 1 1 0 138 0.0 173 0 0 175 nonanginal NaN 38 No

303 rows × 15 columns

df.sort_values(by = 'Sex', ascending = False)
Unnamed: 0 Age Sex ChestPain RestBP Chol Fbs RestECG MaxHR ExAng Oldpeak Slope Ca Thal AHD
15 16 57 1 nonanginal 150 168 0 0 174 0 1.6 1 0.0 normal No
302 303 38 1 nonanginal 138 175 0 0 173 0 0.0 1 NaN normal No
0 1 63 1 typical 145 233 1 2 150 0 2.3 3 0.0 fixed No
1 2 67 1 asymptomatic 160 286 0 2 108 1 1.5 2 3.0 normal Yes
2 3 67 1 asymptomatic 120 229 0 2 129 1 2.6 2 2.0 reversable Yes
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
291 292 55 0 nontypical 132 342 0 0 166 0 1.2 1 0.0 normal No
18 19 48 0 nonanginal 130 275 0 0 139 0 0.2 1 0.0 normal No
7 8 57 0 asymptomatic 120 354 0 0 163 1 0.6 1 0.0 normal No
6 7 62 0 asymptomatic 140 268 0 2 160 0 3.6 3 2.0 normal Yes
4 5 41 0 nontypical 130 204 0 2 172 0 1.4 1 0.0 normal No

303 rows × 15 columns

df.T
0 1 2 3 4 5 6 7 8 9 ... 293 294 295 296 297 298 299 300 301 302
Unnamed: 0 1 2 3 4 5 6 7 8 9 10 ... 294 295 296 297 298 299 300 301 302 303
Age 63 67 67 37 41 56 62 57 63 53 ... 63 63 41 59 57 45 68 57 57 38
Sex 1 1 1 1 0 1 0 0 1 1 ... 1 0 1 1 0 1 1 1 0 1
ChestPain typical asymptomatic asymptomatic nonanginal nontypical nontypical asymptomatic asymptomatic asymptomatic asymptomatic ... asymptomatic asymptomatic nontypical asymptomatic asymptomatic typical asymptomatic asymptomatic nontypical nonanginal
RestBP 145 160 120 130 130 120 140 120 130 140 ... 140 124 120 164 140 110 144 130 130 138
Chol 233 286 229 250 204 236 268 354 254 203 ... 187 197 157 176 241 264 193 131 236 175
Fbs 1 0 0 0 0 0 0 0 0 1 ... 0 0 0 1 0 0 1 0 0 0
RestECG 2 2 2 0 2 0 2 0 2 2 ... 2 0 0 2 0 0 0 0 2 0
MaxHR 150 108 129 187 172 178 160 163 147 155 ... 144 136 182 90 123 132 141 115 174 173
ExAng 0 1 1 0 0 0 0 1 0 1 ... 1 1 0 0 1 0 0 1 0 0
Oldpeak 2.3 1.5 2.6 3.5 1.4 0.8 3.6 0.6 1.4 3.1 ... 4.0 0.0 0.0 1.0 0.2 1.2 3.4 1.2 0.0 0.0
Slope 3 2 2 3 1 1 3 1 2 3 ... 1 2 1 2 2 2 2 2 2 1
Ca 0.0 3.0 2.0 0.0 0.0 0.0 2.0 0.0 1.0 0.0 ... 2.0 0.0 0.0 2.0 0.0 0.0 2.0 1.0 1.0 NaN
Thal fixed normal reversable normal normal normal normal normal reversable reversable ... reversable normal normal fixed reversable reversable reversable reversable normal normal
AHD No Yes Yes No No No Yes No Yes Yes ... Yes Yes No Yes Yes Yes Yes Yes Yes No

15 rows × 303 columns

df['Age']
0      63
1      67
2      67
3      37
4      41
       ..
298    45
299    68
300    57
301    57
302    38
Name: Age, Length: 303, dtype: int64
df[['Age', 'Sex']]
Age Sex
0 63 1
1 67 1
2 67 1
3 37 1
4 41 0
... ... ...
298 45 1
299 68 1
300 57 1
301 57 0
302 38 1

303 rows × 2 columns

df[['Age', 'Sex']][0:2]
Age Sex
0 63 1
1 67 1
df[0:2][['Age']]
Age
0 63
1 67

Data Selection

df.loc[4]
# df
Unnamed: 0             5
Age                   41
Sex                    0
ChestPain     nontypical
RestBP               130
Chol                 204
Fbs                    0
RestECG                2
MaxHR                172
ExAng                  0
Oldpeak              1.4
Slope                  1
Ca                   0.0
Thal              normal
AHD                   No
Name: 4, dtype: object
df.loc[[3,5], ['Age', 'Fbs']]
Age Fbs
3 37 0
5 56 0
df.iloc[5]
Unnamed: 0             6
Age                   56
Sex                    1
ChestPain     nontypical
RestBP               120
Chol                 236
Fbs                    0
RestECG                0
MaxHR                178
ExAng                  0
Oldpeak              0.8
Slope                  1
Ca                   0.0
Thal              normal
AHD                   No
Name: 5, dtype: object
df.iloc[5, 3]
'nontypical'
# df.iloc[5:7, 3:7]
# df.iloc[[5, 7, 9], 3:7]
# df.iloc[:, 3]
df.iloc[3: 5, :]
# df.loc[5:7, 3:7]
Unnamed: 0 Age Sex ChestPain RestBP Chol Fbs RestECG MaxHR ExAng Oldpeak Slope Ca Thal AHD
3 4 37 1 nonanginal 130 250 0 0 187 0 3.5 3 0.0 normal No
4 5 41 0 nontypical 130 204 0 2 172 0 1.4 1 0.0 normal No
# Three ways to read the same single cell (row 1, column 'Age'):
df.iloc[1, 1] # general position-based indexer
df.at[1, 'Age'] # label-based scalar accessor; a faster alternative to loc
df.iat[1, 1] # position-based scalar accessor; a faster alternative to iloc
np.int64(67)
df['Age'] > 50
0       True
1       True
2       True
3      False
4      False
       ...  
298    False
299     True
300     True
301     True
302    False
Name: Age, Length: 303, dtype: bool
df[df.Age > 50]
Unnamed: 0 Age Sex ChestPain RestBP Chol Fbs RestECG MaxHR ExAng Oldpeak Slope Ca Thal AHD
0 1 63 1 typical 145 233 1 2 150 0 2.3 3 0.0 fixed No
1 2 67 1 asymptomatic 160 286 0 2 108 1 1.5 2 3.0 normal Yes
2 3 67 1 asymptomatic 120 229 0 2 129 1 2.6 2 2.0 reversable Yes
5 6 56 1 nontypical 120 236 0 0 178 0 0.8 1 0.0 normal No
6 7 62 0 asymptomatic 140 268 0 2 160 0 3.6 3 2.0 normal Yes
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
296 297 59 1 asymptomatic 164 176 1 2 90 0 1.0 2 2.0 fixed Yes
297 298 57 0 asymptomatic 140 241 0 0 123 1 0.2 2 0.0 reversable Yes
299 300 68 1 asymptomatic 144 193 1 0 141 0 3.4 2 2.0 reversable Yes
300 301 57 1 asymptomatic 130 131 0 0 115 1 1.2 2 1.0 reversable Yes
301 302 57 0 nontypical 130 236 0 2 174 0 0.0 2 1.0 normal Yes

209 rows × 15 columns

# Take the first five rows as an independent frame. Copying *after* slicing
# copies only the rows that are kept and makes df2 own its data, so adding a
# column to it later does not raise SettingWithCopyWarning.
df2 = df.iloc[0:5, :].copy()
df2
Unnamed: 0 Age Sex ChestPain RestBP Chol Fbs RestECG MaxHR ExAng Oldpeak Slope Ca Thal AHD
0 1 63 1 typical 145 233 1 2 150 0 2.3 3 0.0 fixed No
1 2 67 1 asymptomatic 160 286 0 2 108 1 1.5 2 3.0 normal Yes
2 3 67 1 asymptomatic 120 229 0 2 129 1 2.6 2 2.0 reversable Yes
3 4 37 1 nonanginal 130 250 0 0 187 0 3.5 3 0.0 normal No
4 5 41 0 nontypical 130 204 0 2 172 0 1.4 1 0.0 normal No
# Assigning to a column name that does not exist yet creates the column.
# Generic form: dfa['A'] = list(range(len(dfa.index)))
df2['cp'] = ['three', 'two', 'one', 'one', 'zero']
df2
# Boolean mask: True for the rows whose 'cp' value is in the given list.
df2.cp.isin(['one', 'zero'])
0    False
1    False
2     True
3     True
4     True
Name: cp, dtype: bool
df2[df2.cp.isin(['one', 'zero'])]
Unnamed: 0 Age Sex ChestPain RestBP Chol Fbs RestECG MaxHR ExAng Oldpeak Slope Ca Thal AHD cp
2 3 67 1 asymptomatic 120 229 0 2 129 1 2.6 2 2.0 reversable Yes one
3 4 37 1 nonanginal 130 250 0 0 187 0 3.5 3 0.0 normal No one
4 5 41 0 nontypical 130 204 0 2 172 0 1.4 1 0.0 normal No zero

Missing Data

# Keep rows 0-3 and append a new column 'E', which starts out as all NaN.
df1 = df.reindex(index=range(4), columns=[*df.columns, 'E'])
# .loc slices by label and includes both endpoints: rows 2 and 3 get E = 1.
df1.loc[2:3, 'E'] = 1
df1
Unnamed: 0 Age Sex ChestPain RestBP Chol Fbs RestECG MaxHR ExAng Oldpeak Slope Ca Thal AHD E
0 1 63 1 typical 145 233 1 2 150 0 2.3 3 0.0 fixed No NaN
1 2 67 1 asymptomatic 160 286 0 2 108 1 1.5 2 3.0 normal Yes NaN
2 3 67 1 asymptomatic 120 229 0 2 129 1 2.6 2 2.0 reversable Yes 1.0
3 4 37 1 nonanginal 130 250 0 0 187 0 3.5 3 0.0 normal No 1.0
df1.dropna(how='any')
Unnamed: 0 Age Sex ChestPain RestBP Chol Fbs RestECG MaxHR ExAng Oldpeak Slope Ca Thal AHD E
2 3 67 1 asymptomatic 120 229 0 2 129 1 2.6 2 2.0 reversable Yes 1.0
3 4 37 1 nonanginal 130 250 0 0 187 0 3.5 3 0.0 normal No 1.0
pd.isna(df1)
Unnamed: 0 Age Sex ChestPain RestBP Chol Fbs RestECG MaxHR ExAng Oldpeak Slope Ca Thal AHD E
0 False False False False False False False False False False False False False False False True
1 False False False False False False False False False False False False False False False True
2 False False False False False False False False False False False False False False False False
3 False False False False False False False False False False False False False False False False
df1.fillna(value=2)
Unnamed: 0 Age Sex ChestPain RestBP Chol Fbs RestECG MaxHR ExAng Oldpeak Slope Ca Thal AHD E
0 1 63 1 typical 145 233 1 2 150 0 2.3 3 0.0 fixed No 2.0
1 2 67 1 asymptomatic 160 286 0 2 108 1 1.5 2 3.0 normal Yes 2.0
2 3 67 1 asymptomatic 120 229 0 2 129 1 2.6 2 2.0 reversable Yes 1.0
3 4 37 1 nonanginal 130 250 0 0 187 0 3.5 3 0.0 normal No 1.0

Operations

df1.mean(numeric_only=True)
Unnamed: 0      2.500
Age            58.500
Sex             1.000
RestBP        138.750
Chol          249.500
Fbs             0.250
RestECG         1.500
MaxHR         143.500
ExAng           0.500
Oldpeak         2.475
Slope           2.500
Ca              1.250
E               1.000
dtype: float64
df1.mean(axis = 1, numeric_only=True)
0    50.108333
1    52.791667
2    43.046154
3    47.423077
dtype: float64
# Multiply every row by 10. With axis='index' the list is matched against the
# rows, so it needs one multiplier per row; sizing it with len(df1) avoids a
# hard-coded row count. String columns are repeated rather than scaled.
# df1.sub([10] * len(df1), axis='index')
df1.mul([10] * len(df1), axis='index')
Unnamed: 0 Age Sex ChestPain RestBP Chol Fbs RestECG MaxHR ExAng Oldpeak Slope Ca Thal AHD E
0 10 630 10 typicaltypicaltypicaltypicaltypicaltypicaltypi... 1450 2330 10 20 1500 0 23.0 30 0.0 fixedfixedfixedfixedfixedfixedfixedfixedfixedf... NoNoNoNoNoNoNoNoNoNo NaN
1 20 670 10 asymptomaticasymptomaticasymptomaticasymptomat... 1600 2860 0 20 1080 10 15.0 20 30.0 normalnormalnormalnormalnormalnormalnormalnorm... YesYesYesYesYesYesYesYesYesYes NaN
2 30 670 10 asymptomaticasymptomaticasymptomaticasymptomat... 1200 2290 0 20 1290 10 26.0 20 20.0 reversablereversablereversablereversablerevers... YesYesYesYesYesYesYesYesYesYes 10.0
3 40 370 10 nonanginalnonanginalnonanginalnonanginalnonang... 1300 2500 0 0 1870 0 35.0 30 0.0 normalnormalnormalnormalnormalnormalnormalnorm... NoNoNoNoNoNoNoNoNoNo 10.0
df1
Unnamed: 0 Age Sex ChestPain RestBP Chol Fbs RestECG MaxHR ExAng Oldpeak Slope Ca Thal AHD E
0 1 63 1 typical 145 233 1 2 150 0 2.3 3 0.0 fixed No NaN
1 2 67 1 asymptomatic 160 286 0 2 108 1 1.5 2 3.0 normal Yes NaN
2 3 67 1 asymptomatic 120 229 0 2 129 1 2.6 2 2.0 reversable Yes 1.0
3 4 37 1 nonanginal 130 250 0 0 187 0 3.5 3 0.0 normal No 1.0
df1.apply(np.cumsum)
Unnamed: 0 Age Sex ChestPain RestBP Chol Fbs RestECG MaxHR ExAng Oldpeak Slope Ca Thal AHD E
0 1 63 1 typical 145 233 1 2 150 0 2.3 3 0.0 fixed No NaN
1 3 130 2 typicalasymptomatic 305 519 1 4 258 1 3.8 5 3.0 fixednormal NoYes NaN
2 6 197 3 typicalasymptomaticasymptomatic 425 748 1 6 387 2 6.4 7 5.0 fixednormalreversable NoYesYes 1.0
3 10 234 4 typicalasymptomaticasymptomaticnonanginal 555 998 1 6 574 2 9.9 10 5.0 fixednormalreversablenormal NoYesYesNo 2.0
# Convert age in years to days with vectorized arithmetic instead of calling
# a Python lambda once per element; the result is the same float Series.
df1['Age'] * 365.25
0    23010.75
1    24471.75
2    24471.75
3    13514.25
Name: Age, dtype: float64
s = pd.Series(np.random.randint(0, 7, size=10))
s
0    0
1    0
2    2
3    5
4    3
5    1
6    2
7    1
8    0
9    4
dtype: int64
s.value_counts()
0    3
2    2
1    2
5    1
3    1
4    1
Name: count, dtype: int64
s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'])
s
0       A
1       B
2       C
3    Aaba
4    Baca
5     NaN
6    CABA
7     dog
8     cat
dtype: object
s.str.lower()
0       a
1       b
2       c
3    aaba
4    baca
5     NaN
6    caba
7     dog
8     cat
dtype: object

Merge

# Split a random 10x4 frame into three consecutive row chunks (3, 4, 3 rows).
df2 = pd.DataFrame(np.random.randn(10, 4))
pieces = [df2.iloc[:3], df2.iloc[3:7], df2.iloc[7:]]
pieces
[          0         1         2         3
 0  1.713887 -0.783177 -0.870252  0.174835
 1  1.488003  0.873179 -0.011141 -0.379791
 2 -1.338324  0.206922 -1.007336  1.301022,
           0         1         2         3
 3 -1.145018  0.673621 -0.113262 -0.372458
 4 -0.158453  0.521651  0.385584 -0.787333
 5 -0.550533 -1.174613 -0.353579 -1.384881
 6  0.193290 -1.922045 -1.166896 -0.778034,
           0         1         2         3
 7 -0.287181  0.480147 -0.777828  0.614199
 8 -0.422591  1.321224  0.092496  0.125092
 9 -0.209045  1.201053  0.013136  0.131502]
pd.concat([pieces[2], pieces[1], pieces[0]])
0 1 2 3
7 -0.287181 0.480147 -0.777828 0.614199
8 -0.422591 1.321224 0.092496 0.125092
9 -0.209045 1.201053 0.013136 0.131502
3 -1.145018 0.673621 -0.113262 -0.372458
4 -0.158453 0.521651 0.385584 -0.787333
5 -0.550533 -1.174613 -0.353579 -1.384881
6 0.193290 -1.922045 -1.166896 -0.778034
0 1.713887 -0.783177 -0.870252 0.174835
1 1.488003 0.873179 -0.011141 -0.379791
2 -1.338324 0.206922 -1.007336 1.301022
# Two frames sharing a 'key' column in which the same key value appears twice.
left = pd.DataFrame(dict(key=['foo', 'foo'], lval=[1, 2]))
right = pd.DataFrame(dict(key=['foo', 'foo'], rval=[4, 5]))

left
key lval
0 foo 1
1 foo 2
right
key rval
0 foo 4
1 foo 5
pd.merge(left, right, on='key')
key lval rval
0 foo 1 4
1 foo 1 5
2 foo 2 4
3 foo 2 5
# With distinct key values, the merge pairs each left row with one right row.
left = pd.DataFrame(dict(key=['foo', 'bar'], lval=[1, 2]))
right = pd.DataFrame(dict(key=['foo', 'bar'], rval=[4, 5]))
left.merge(right, on='key')
key lval rval
0 foo 1 4
1 bar 2 5
left
key lval
0 foo 1
1 bar 2
right
key rval
0 foo 4
1 bar 5
df = pd.DataFrame(np.random.randn(8, 4), columns=['A', 'B', 'C', 'D'])
df
A B C D
0 -1.372809 0.278032 -0.885468 1.619520
1 -1.013028 0.945348 -0.439954 0.996883
2 -0.544783 0.031908 -0.956627 -2.057062
3 -0.608442 -1.219639 0.687609 0.876758
4 0.853538 0.017766 0.947402 -0.702543
5 -0.133417 -0.286849 -0.341930 -0.654006
6 0.172827 -1.622207 1.467215 0.261368
7 -0.640789 -0.501795 0.948659 0.024495

Grouping

# Two string key columns (A, B) and two columns of random floats (C, D).
df = pd.DataFrame(
    {
        'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
        'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
        'C': np.random.randn(8),
        'D': np.random.randn(8),
    }
)

df
A B C D
0 foo one -0.300721 -0.747035
1 bar one -0.732933 0.185364
2 foo two -0.153550 1.316260
3 bar three -0.099058 -0.436411
4 foo two -1.168426 1.083197
5 bar two -0.572091 -0.005793
6 foo one -0.563868 1.556895
7 foo three -0.154034 -0.944887
df.groupby('A').sum()
B C D
A
bar onethreetwo -1.404083 -0.256840
foo onetwotwoonethree -2.340598 2.264429
df.groupby(['A', 'B']).sum()
C D
A B
bar one -0.732933 0.185364
three -0.099058 -0.436411
two -0.572091 -0.005793
foo one -0.864588 0.809859
three -0.154034 -0.944887
two -1.321976 2.399457

Plotting

# Select column 'C' before aggregating so only that column is summed (the
# string column 'B' is not concatenated needlessly), then draw one bar per
# group of 'A'.
df.groupby('A')['C'].sum().plot(kind='bar')
<Axes: xlabel='A'>
_images/60d4894dd31cf804cd55e6298babe20139dc33e9ce1f49a2f99d65a65eebe44d.png
df2 = pd.DataFrame(np.random.rand(10, 4), columns=['a', 'b', 'c', 'd'])
df2.plot.bar()
<Axes: >
_images/a546d613dabc6489c730f137ab7016fe8d44eff7e1dec8a628c5fc26ebe47e2e.png
df2.plot.bar(stacked=True);
_images/8808d2588b93821fd2a9e7851bc18c4d33bcbb49dad2884a7c4acddeaafdfca1.png
df2.plot.barh(stacked=True)
<Axes: >
_images/1da6472319fbc861bd49f5267f4bf654727822900094e523a445b648c23e9655.png

File I/O

df = pd.read_csv('heart.csv')
df.head()
Unnamed: 0 Age Sex ChestPain RestBP Chol Fbs RestECG MaxHR ExAng Oldpeak Slope Ca Thal AHD
0 1 63 1 typical 145 233 1 2 150 0 2.3 3 0.0 fixed No
1 2 67 1 asymptomatic 160 286 0 2 108 1 1.5 2 3.0 normal Yes
2 3 67 1 asymptomatic 120 229 0 2 129 1 2.6 2 2.0 reversable Yes
3 4 37 1 nonanginal 130 250 0 0 187 0 3.5 3 0.0 normal No
4 5 41 0 nontypical 130 204 0 2 172 0 1.4 1 0.0 normal No
# df.to_csv('heart_copy.csv', index = False)
# df1 = df[:10]
# df2 = df[10:20]
# df3 = df[20:40]
# with pd.ExcelWriter('path_to_file.xlsx') as writer:
#     df1.to_excel(writer, sheet_name='Sheet1', index = False)
#     df2.to_excel(writer, sheet_name='Sheet2', index = False)
#     df3.to_excel(writer, sheet_name='Sheet3')