Using pandas

import pandas as pd
import numpy as np # for generating random numbers

Series

# series
s1 = pd.Series([10, 11, 12], dtype = float)
s1
0    10.0
1    11.0
2    12.0
dtype: float64
s2 = pd.Series([10, 11, 12], dtype = int)
s2
0    10
1    11
2    12
dtype: int64
# String-valued Series carrying its own name and a named, labelled index
s3 = pd.Series(
    data=["10", "11", "12"],
    index=pd.Index(list("abc"), name="label"),
    name="Val",
)
s3
label
a    10
b    11
c    12
Name: Val, dtype: object
s1.index
RangeIndex(start=0, stop=3, step=1)
# Series.append (e.g. s1.append(s2)) was removed in pandas 2.0;
# pd.concat is the supported way to stack Series end to end.
# Each input keeps its own index labels, and mixing the float values of s1
# with the strings of s3 yields an object-dtype result.
s4 = pd.concat([s1, s3])
s4
0    10.0
1    11.0
2    12.0
a      10
b      11
c      12
dtype: object
print(s4[0])
print(s4)
s4_reset = s4.reset_index(drop=True)
print(s4_reset)
s4_reset[0]
10.0
0    10.0
1    11.0
2    12.0
a      10
b      11
c      12
dtype: object
0    10.0
1    11.0
2    12.0
3      10
4      11
5      12
dtype: object
10.0
# Seven consecutive calendar days, starting on 2019-05-03
dates = pd.date_range(start='20190503', periods=7, freq='D')

dates
DatetimeIndex(['2019-05-03', '2019-05-04', '2019-05-05', '2019-05-06',
               '2019-05-07', '2019-05-08', '2019-05-09'],
              dtype='datetime64[ns]', freq='D')

DataFrame

df = pd.DataFrame(np.random.randn(7, 4), index=dates, columns=list('ABCD'))
df
A B C D
2019-05-03 1.106142 -1.411186 -0.664336 -0.998144
2019-05-04 -0.219846 -0.562399 0.882633 -0.183028
2019-05-05 0.130602 -0.559537 -0.683457 1.393433
2019-05-06 2.234397 1.709079 0.610843 0.119211
2019-05-07 0.447079 -0.928171 1.102700 0.010732
2019-05-08 -0.406803 1.557577 -1.035966 -0.112492
2019-05-09 -0.767276 -0.932870 -1.022550 0.422754
# Build a frame from a dict of columns with different dtypes. The scalar
# entries ('A', 'B', 'F') are repeated for every row; the row labels 2..5
# come from the index of the Series used for column 'C'.
df2 = pd.DataFrame(
    {
        'A': 1.0,
        'B': pd.Timestamp('20130102'),
        'C': pd.Series(1, index=[2, 3, 4, 5], dtype='float32'),
        'D': np.full(4, 3, dtype='int32'),
        'E': pd.Categorical(["test", "train"] * 2),
        'F': 'foo',
    }
)
df2
A B C D E F
2 1.0 2013-01-02 1.0 3 test foo
3 1.0 2013-01-02 1.0 3 train foo
4 1.0 2013-01-02 1.0 3 test foo
5 1.0 2013-01-02 1.0 3 train foo
df2.dtypes
A          float64
B    datetime64[s]
C          float32
D            int32
E         category
F           object
dtype: object
df = pd.read_csv('heart.csv')
df
Unnamed: 0 Age Sex ChestPain RestBP Chol Fbs RestECG MaxHR ExAng Oldpeak Slope Ca Thal AHD
0 1 63 1 typical 145 233 1 2 150 0 2.3 3 0.0 fixed No
1 2 67 1 asymptomatic 160 286 0 2 108 1 1.5 2 3.0 normal Yes
2 3 67 1 asymptomatic 120 229 0 2 129 1 2.6 2 2.0 reversable Yes
3 4 37 1 nonanginal 130 250 0 0 187 0 3.5 3 0.0 normal No
4 5 41 0 nontypical 130 204 0 2 172 0 1.4 1 0.0 normal No
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
298 299 45 1 typical 110 264 0 0 132 0 1.2 2 0.0 reversable Yes
299 300 68 1 asymptomatic 144 193 1 0 141 0 3.4 2 2.0 reversable Yes
300 301 57 1 asymptomatic 130 131 0 0 115 1 1.2 2 1.0 reversable Yes
301 302 57 0 nontypical 130 236 0 2 174 0 0.0 2 1.0 normal Yes
302 303 38 1 nonanginal 138 175 0 0 173 0 0.0 1 NaN normal No

303 rows × 15 columns

Viewing Data

df.head(10)
Unnamed: 0 Age Sex ChestPain RestBP Chol Fbs RestECG MaxHR ExAng Oldpeak Slope Ca Thal AHD
0 1 63 1 typical 145 233 1 2 150 0 2.3 3 0.0 fixed No
1 2 67 1 asymptomatic 160 286 0 2 108 1 1.5 2 3.0 normal Yes
2 3 67 1 asymptomatic 120 229 0 2 129 1 2.6 2 2.0 reversable Yes
3 4 37 1 nonanginal 130 250 0 0 187 0 3.5 3 0.0 normal No
4 5 41 0 nontypical 130 204 0 2 172 0 1.4 1 0.0 normal No
5 6 56 1 nontypical 120 236 0 0 178 0 0.8 1 0.0 normal No
6 7 62 0 asymptomatic 140 268 0 2 160 0 3.6 3 2.0 normal Yes
7 8 57 0 asymptomatic 120 354 0 0 163 1 0.6 1 0.0 normal No
8 9 63 1 asymptomatic 130 254 0 2 147 0 1.4 2 1.0 reversable Yes
9 10 53 1 asymptomatic 140 203 1 2 155 1 3.1 3 0.0 reversable Yes
df.tail(8)
Unnamed: 0 Age Sex ChestPain RestBP Chol Fbs RestECG MaxHR ExAng Oldpeak Slope Ca Thal AHD
295 296 41 1 nontypical 120 157 0 0 182 0 0.0 1 0.0 normal No
296 297 59 1 asymptomatic 164 176 1 2 90 0 1.0 2 2.0 fixed Yes
297 298 57 0 asymptomatic 140 241 0 0 123 1 0.2 2 0.0 reversable Yes
298 299 45 1 typical 110 264 0 0 132 0 1.2 2 0.0 reversable Yes
299 300 68 1 asymptomatic 144 193 1 0 141 0 3.4 2 2.0 reversable Yes
300 301 57 1 asymptomatic 130 131 0 0 115 1 1.2 2 1.0 reversable Yes
301 302 57 0 nontypical 130 236 0 2 174 0 0.0 2 1.0 normal Yes
302 303 38 1 nonanginal 138 175 0 0 173 0 0.0 1 NaN normal No
df.describe()
Unnamed: 0 Age Sex RestBP Chol Fbs RestECG MaxHR ExAng Oldpeak Slope Ca
count 303.000000 303.000000 303.000000 303.000000 303.000000 303.000000 303.000000 303.000000 303.000000 303.000000 303.000000 299.000000
mean 152.000000 54.438944 0.679868 131.689769 246.693069 0.148515 0.990099 149.607261 0.326733 1.039604 1.600660 0.672241
std 87.612784 9.038662 0.467299 17.599748 51.776918 0.356198 0.994971 22.875003 0.469794 1.161075 0.616226 0.937438
min 1.000000 29.000000 0.000000 94.000000 126.000000 0.000000 0.000000 71.000000 0.000000 0.000000 1.000000 0.000000
25% 76.500000 48.000000 0.000000 120.000000 211.000000 0.000000 0.000000 133.500000 0.000000 0.000000 1.000000 0.000000
50% 152.000000 56.000000 1.000000 130.000000 241.000000 0.000000 1.000000 153.000000 0.000000 0.800000 2.000000 0.000000
75% 227.500000 61.000000 1.000000 140.000000 275.000000 0.000000 2.000000 166.000000 1.000000 1.600000 2.000000 1.000000
max 303.000000 77.000000 1.000000 200.000000 564.000000 1.000000 2.000000 202.000000 1.000000 6.200000 3.000000 3.000000
df.index
RangeIndex(start=0, stop=303, step=1)
df.columns
Index(['Unnamed: 0', 'Age', 'Sex', 'ChestPain', 'RestBP', 'Chol', 'Fbs',
       'RestECG', 'MaxHR', 'ExAng', 'Oldpeak', 'Slope', 'Ca', 'Thal', 'AHD'],
      dtype='object')
# Row-wise alternatives (sort by the row labels instead of the column labels):
# df.sort_index(axis=0, ascending=True)
# df.sort_index(axis=0, ascending=False)

# axis=1 sorts the columns by their labels. Neither call modifies df; only the
# value of the last expression (descending order) is displayed, so the
# ascending result on the first line is computed and then discarded.
df.sort_index(axis=1, ascending=True)
df.sort_index(axis=1, ascending=False)
Unnamed: 0 Thal Slope Sex RestECG RestBP Oldpeak MaxHR Fbs ExAng Chol ChestPain Ca Age AHD
0 1 fixed 3 1 2 145 2.3 150 1 0 233 typical 0.0 63 No
1 2 normal 2 1 2 160 1.5 108 0 1 286 asymptomatic 3.0 67 Yes
2 3 reversable 2 1 2 120 2.6 129 0 1 229 asymptomatic 2.0 67 Yes
3 4 normal 3 1 0 130 3.5 187 0 0 250 nonanginal 0.0 37 No
4 5 normal 1 0 2 130 1.4 172 0 0 204 nontypical 0.0 41 No
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
298 299 reversable 2 1 0 110 1.2 132 0 0 264 typical 0.0 45 Yes
299 300 reversable 2 1 0 144 3.4 141 1 0 193 asymptomatic 2.0 68 Yes
300 301 reversable 2 1 0 130 1.2 115 0 1 131 asymptomatic 1.0 57 Yes
301 302 normal 2 0 2 130 0.0 174 0 0 236 nontypical 1.0 57 Yes
302 303 normal 1 1 0 138 0.0 173 0 0 175 nonanginal NaN 38 No

303 rows × 15 columns

df.sort_values(by = 'Sex', ascending = False)
Unnamed: 0 Age Sex ChestPain RestBP Chol Fbs RestECG MaxHR ExAng Oldpeak Slope Ca Thal AHD
302 303 38 1 nonanginal 138 175 0 0 173 0 0.0 1 NaN normal No
0 1 63 1 typical 145 233 1 2 150 0 2.3 3 0.0 fixed No
1 2 67 1 asymptomatic 160 286 0 2 108 1 1.5 2 3.0 normal Yes
2 3 67 1 asymptomatic 120 229 0 2 129 1 2.6 2 2.0 reversable Yes
3 4 37 1 nonanginal 130 250 0 0 187 0 3.5 3 0.0 normal No
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
26 27 58 0 nonanginal 120 340 0 0 172 0 0.0 1 0.0 normal No
27 28 66 0 typical 150 226 0 0 114 0 2.6 3 0.0 normal No
30 31 69 0 typical 140 239 0 0 151 0 1.8 1 2.0 normal No
40 41 65 0 asymptomatic 150 225 0 2 114 0 1.0 2 3.0 reversable Yes
42 43 71 0 nontypical 160 302 0 0 162 0 0.4 1 2.0 normal No

303 rows × 15 columns

df.T
0 1 2 3 4 5 6 7 8 9 ... 293 294 295 296 297 298 299 300 301 302
Unnamed: 0 1 2 3 4 5 6 7 8 9 10 ... 294 295 296 297 298 299 300 301 302 303
Age 63 67 67 37 41 56 62 57 63 53 ... 63 63 41 59 57 45 68 57 57 38
Sex 1 1 1 1 0 1 0 0 1 1 ... 1 0 1 1 0 1 1 1 0 1
ChestPain typical asymptomatic asymptomatic nonanginal nontypical nontypical asymptomatic asymptomatic asymptomatic asymptomatic ... asymptomatic asymptomatic nontypical asymptomatic asymptomatic typical asymptomatic asymptomatic nontypical nonanginal
RestBP 145 160 120 130 130 120 140 120 130 140 ... 140 124 120 164 140 110 144 130 130 138
Chol 233 286 229 250 204 236 268 354 254 203 ... 187 197 157 176 241 264 193 131 236 175
Fbs 1 0 0 0 0 0 0 0 0 1 ... 0 0 0 1 0 0 1 0 0 0
RestECG 2 2 2 0 2 0 2 0 2 2 ... 2 0 0 2 0 0 0 0 2 0
MaxHR 150 108 129 187 172 178 160 163 147 155 ... 144 136 182 90 123 132 141 115 174 173
ExAng 0 1 1 0 0 0 0 1 0 1 ... 1 1 0 0 1 0 0 1 0 0
Oldpeak 2.3 1.5 2.6 3.5 1.4 0.8 3.6 0.6 1.4 3.1 ... 4.0 0.0 0.0 1.0 0.2 1.2 3.4 1.2 0.0 0.0
Slope 3 2 2 3 1 1 3 1 2 3 ... 1 2 1 2 2 2 2 2 2 1
Ca 0.0 3.0 2.0 0.0 0.0 0.0 2.0 0.0 1.0 0.0 ... 2.0 0.0 0.0 2.0 0.0 0.0 2.0 1.0 1.0 NaN
Thal fixed normal reversable normal normal normal normal normal reversable reversable ... reversable normal normal fixed reversable reversable reversable reversable normal normal
AHD No Yes Yes No No No Yes No Yes Yes ... Yes Yes No Yes Yes Yes Yes Yes Yes No

15 rows × 303 columns

df['Age']
0      63
1      67
2      67
3      37
4      41
       ..
298    45
299    68
300    57
301    57
302    38
Name: Age, Length: 303, dtype: int64
df[['Age', 'Sex']]
Age Sex
0 63 1
1 67 1
2 67 1
3 37 1
4 41 0
... ... ...
298 45 1
299 68 1
300 57 1
301 57 0
302 38 1

303 rows × 2 columns

df[['Age', 'Sex']][0:2]
Age Sex
0 63 1
1 67 1
df[0:2][['Age']]
Age
0 63
1 67

Data Selection

df.loc[4]
# df
Unnamed: 0             5
Age                   41
Sex                    0
ChestPain     nontypical
RestBP               130
Chol                 204
Fbs                    0
RestECG                2
MaxHR                172
ExAng                  0
Oldpeak              1.4
Slope                  1
Ca                   0.0
Thal              normal
AHD                   No
Name: 4, dtype: object
df.loc[[3,5], ['Age', 'Fbs']]
Age Fbs
3 37 0
5 56 0
df.iloc[5]
Unnamed: 0             6
Age                   56
Sex                    1
ChestPain     nontypical
RestBP               120
Chol                 236
Fbs                    0
RestECG                0
MaxHR                178
ExAng                  0
Oldpeak              0.8
Slope                  1
Ca                   0.0
Thal              normal
AHD                   No
Name: 5, dtype: object
df.iloc[5, 3]
'nontypical'
# Other position-based selections to try:
# df.iloc[5:7, 3:7]
# df.iloc[[5, 7, 9], 3:7]
# df.iloc[:, 3]
# Rows at positions 3 and 4 (the end of an iloc slice is exclusive), all columns
df.iloc[3: 5, :]
# NOTE(review): loc is label-based and the column labels of df are strings, so
# the integer column slice below presumably fails - confirm before enabling it.
# df.loc[5:7, 3:7]
Unnamed: 0 Age Sex ChestPain RestBP Chol Fbs RestECG MaxHR ExAng Oldpeak Slope Ca Thal AHD
3 4 37 1 nonanginal 130 250 0 0 187 0 3.5 3 0.0 normal No
4 5 41 0 nontypical 130 204 0 2 172 0 1.4 1 0.0 normal No
# Three ways to read the same scalar: row 1 of the 'Age' column (column
# position 1). Only the value of the last expression is displayed.
df.iloc[1, 1] # position-based lookup
df.at[1, 'Age'] # label-based scalar access; a fast, single-value counterpart of loc
df.iat[1, 1] # position-based scalar access; a fast, single-value counterpart of iloc
67
df['Age'] > 50
0       True
1       True
2       True
3      False
4      False
       ...  
298    False
299     True
300     True
301     True
302    False
Name: Age, Length: 303, dtype: bool
df[df.Age > 50]
Unnamed: 0 Age Sex ChestPain RestBP Chol Fbs RestECG MaxHR ExAng Oldpeak Slope Ca Thal AHD
0 1 63 1 typical 145 233 1 2 150 0 2.3 3 0.0 fixed No
1 2 67 1 asymptomatic 160 286 0 2 108 1 1.5 2 3.0 normal Yes
2 3 67 1 asymptomatic 120 229 0 2 129 1 2.6 2 2.0 reversable Yes
5 6 56 1 nontypical 120 236 0 0 178 0 0.8 1 0.0 normal No
6 7 62 0 asymptomatic 140 268 0 2 160 0 3.6 3 2.0 normal Yes
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
296 297 59 1 asymptomatic 164 176 1 2 90 0 1.0 2 2.0 fixed Yes
297 298 57 0 asymptomatic 140 241 0 0 123 1 0.2 2 0.0 reversable Yes
299 300 68 1 asymptomatic 144 193 1 0 141 0 3.4 2 2.0 reversable Yes
300 301 57 1 asymptomatic 130 131 0 0 115 1 1.2 2 1.0 reversable Yes
301 302 57 0 nontypical 130 236 0 2 174 0 0.0 2 1.0 normal Yes

209 rows × 15 columns

# Work on an independent copy of the first five rows only. Slicing first and
# copying second avoids duplicating all of df, and guarantees that columns
# added to df2 later can never write through to df.
df2 = df.iloc[0:5, :].copy()
df2
Unnamed: 0 Age Sex ChestPain RestBP Chol Fbs RestECG MaxHR ExAng Oldpeak Slope Ca Thal AHD
0 1 63 1 typical 145 233 1 2 150 0 2.3 3 0.0 fixed No
1 2 67 1 asymptomatic 160 286 0 2 108 1 1.5 2 3.0 normal Yes
2 3 67 1 asymptomatic 120 229 0 2 129 1 2.6 2 2.0 reversable Yes
3 4 37 1 nonanginal 130 250 0 0 187 0 3.5 3 0.0 normal No
4 5 41 0 nontypical 130 204 0 2 172 0 1.4 1 0.0 normal No
# Assigning to a column label that does not exist yet creates the column, e.g.
# frame['A'] = list(range(len(frame.index)))
# The assigned list needs one value per row (df2 has five rows here).
df2['cp'] = ['three', 'two', 'one', 'one', 'zero']
df2
# Boolean mask: True for the rows whose 'cp' value is one of the listed values
df2.cp.isin(['one', 'zero'])
0    False
1    False
2     True
3     True
4     True
Name: cp, dtype: bool
df2[df2.cp.isin(['one', 'zero'])]
Unnamed: 0 Age Sex ChestPain RestBP Chol Fbs RestECG MaxHR ExAng Oldpeak Slope Ca Thal AHD cp
2 3 67 1 asymptomatic 120 229 0 2 129 1 2.6 2 2.0 reversable Yes one
3 4 37 1 nonanginal 130 250 0 0 187 0 3.5 3 0.0 normal No one
4 5 41 0 nontypical 130 204 0 2 172 0 1.4 1 0.0 normal No zero

Missing Data

df1 = df.reindex(index=range(0, 4), columns=list(df.columns) + ['E'])
df1.loc[2:3, 'E'] = 1
df1
Unnamed: 0 Age Sex ChestPain RestBP Chol Fbs RestECG MaxHR ExAng Oldpeak Slope Ca Thal AHD E
0 1 63 1 typical 145 233 1 2 150 0 2.3 3 0.0 fixed No NaN
1 2 67 1 asymptomatic 160 286 0 2 108 1 1.5 2 3.0 normal Yes NaN
2 3 67 1 asymptomatic 120 229 0 2 129 1 2.6 2 2.0 reversable Yes 1.0
3 4 37 1 nonanginal 130 250 0 0 187 0 3.5 3 0.0 normal No 1.0
df1.dropna(how='any')
Unnamed: 0 Age Sex ChestPain RestBP Chol Fbs RestECG MaxHR ExAng Oldpeak Slope Ca Thal AHD E
2 3 67 1 asymptomatic 120 229 0 2 129 1 2.6 2 2.0 reversable Yes 1.0
3 4 37 1 nonanginal 130 250 0 0 187 0 3.5 3 0.0 normal No 1.0
pd.isna(df1)
Unnamed: 0 Age Sex ChestPain RestBP Chol Fbs RestECG MaxHR ExAng Oldpeak Slope Ca Thal AHD E
0 False False False False False False False False False False False False False False False True
1 False False False False False False False False False False False False False False False True
2 False False False False False False False False False False False False False False False False
3 False False False False False False False False False False False False False False False False
df1.fillna(value=2)
Unnamed: 0 Age Sex ChestPain RestBP Chol Fbs RestECG MaxHR ExAng Oldpeak Slope Ca Thal AHD E
0 1 63 1 typical 145 233 1 2 150 0 2.3 3 0.0 fixed No 2.0
1 2 67 1 asymptomatic 160 286 0 2 108 1 1.5 2 3.0 normal Yes 2.0
2 3 67 1 asymptomatic 120 229 0 2 129 1 2.6 2 2.0 reversable Yes 1.0
3 4 37 1 nonanginal 130 250 0 0 187 0 3.5 3 0.0 normal No 1.0

Operations

# Column-wise mean of the numeric columns.
# df1 also holds text columns (ChestPain, Thal, AHD). With pandas 2.x a plain
# df1.mean() does not skip them; it fails with
#   TypeError: Could not convert ['typicalasymptomatic...' ...] to numeric
# so the reduction has to be restricted to numeric columns explicitly.
df1.mean(numeric_only=True)
# Row-wise mean. numeric_only=True is required: every row mixes numbers with
# text (ChestPain, Thal, AHD), which a plain mean cannot reduce.
df1.mean(axis=1, numeric_only=True)
# df1.sub([10] * len(df1), axis='index')
# Multiply each row by its own factor (one list entry per row); the list is
# sized from df1 instead of hard-coding the row count. df1 is not modified.
df1.mul([10] * len(df1), axis='index')
df1
# apply with a function that maps a whole column to a column: running totals
df1.apply(np.cumsum)
# apply on a single column calls the function once per element
# (presumably converting an age in years to days - confirm the unit)
df1.Age.apply(lambda x: x * 365.25)
# Ten random integers drawn from 0..6, followed by how often each value occurs
s = pd.Series(np.random.randint(7, size=10))
s
s.value_counts()
s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'])
s
s.str.lower()

Merge

# Cut a random 10x4 frame into three row chunks of 3, 4 and 3 rows
df2 = pd.DataFrame(np.random.randn(10, 4))
pieces = [df2.iloc[:3], df2.iloc[3:7], df2.iloc[7:]]

pieces

# Stack the chunks again, last chunk first; the original row labels are kept
pd.concat(pieces[::-1])
# Both frames repeat the key 'foo', so the join pairs every left row with
# every right row.
left = pd.DataFrame({'key': ['foo'] * 2, 'lval': [1, 2]})
right = pd.DataFrame({'key': ['foo'] * 2, 'rval': [4, 5]})

left
right
left.merge(right, on='key')
# With distinct keys on both sides each left row matches exactly one right row
keys = ['foo', 'bar']
left = pd.DataFrame({'key': keys, 'lval': [1, 2]})
right = pd.DataFrame({'key': keys, 'rval': [4, 5]})
left.merge(right, on='key')
left
right
df = pd.DataFrame(np.random.randn(8, 4), columns=['A', 'B', 'C', 'D'])
# DataFrame.append was removed in pandas 2.0; pd.concat replaces it.
# df.iloc[[3]] (list indexer) keeps row 3 as a one-row DataFrame so that it is
# added as a row, and ignore_index=True renumbers the result 0..8.
# pd.concat([df, df.iloc[[3]]])
appended = pd.concat([df, df.iloc[[3]]], ignore_index=True)
appended

Grouping

# Two text columns to group by (A, B) and two random numeric columns (C, D)
df = pd.DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
                    'foo', 'bar', 'foo', 'foo'],
                    'B': ['one', 'one', 'two', 'three',
                    'two', 'two', 'one', 'three'],
                    'C': np.random.randn(8),
                    'D': np.random.randn(8)})

df
# Group by A and sum the numeric columns only. Selecting C and D explicitly
# keeps the text column B out of the sum, where it would merely be
# concatenated into one long string per group.
df.groupby('A')[['C', 'D']].sum()
# Grouping by both text columns leaves only C and D to aggregate
df.groupby(['A', 'B']).sum()

Plotting

# Bar chart of the per-group totals of C. Selecting the column before
# aggregating sums only C instead of summing every column (text included)
# and then throwing all but one of the results away.
df.groupby('A')['C'].sum().plot(kind='bar')
# Random values in [0, 1) for four series, drawn as grouped, stacked and
# horizontal stacked bar charts
df2 = pd.DataFrame(np.random.rand(10, 4), columns=list('abcd'))
df2.plot(kind='bar')
df2.plot(kind='bar', stacked=True);
df2.plot(kind='barh', stacked=True)

File I/O

# Reload heart.csv from the working directory and preview its first rows
df = pd.read_csv('heart.csv')
df.head()
# Writing examples (commented out, so nothing is written to disk).
# index = False leaves the row labels out of the written file.
# df.to_csv('heart_copy.csv', index = False)
# Three row slices, each written to its own sheet of one Excel workbook:
# df1 = df[:10]
# df2 = df[10:20]
# df3 = df[20:40]
# with pd.ExcelWriter('path_to_file.xlsx') as writer:
#     df1.to_excel(writer, sheet_name='Sheet1', index = False)
#     df2.to_excel(writer, sheet_name='Sheet2', index = False)
#     NOTE(review): Sheet3 is written without index = False, unlike the other
#     two sheets - confirm whether that difference is intended.
#     df3.to_excel(writer, sheet_name='Sheet3')