Using pandas

Using pandas#

import pandas as pd
import numpy as np # for generating random numbers

Series#

# A float Series built from integer literals; with no index given, the rows
# are labelled 0..2.
s1 = pd.Series(data=[10, 11, 12], dtype=float)
s1

0    10.0
1    11.0
2    12.0
dtype: float64

# The same three values, this time stored as integers.
s2 = pd.Series(data=[10, 11, 12], dtype=int)
s2

0    10
1    11
2    12
dtype: int64

# A Series of strings with its own name and a named, non-numeric index.
s3 = pd.Series(
    ["10", "11", "12"],
    index=pd.Index(['a', 'b', 'c'], name='label'),
    name='Val',
)
s3

label
a    10
b    11
c    12
Name: Val, dtype: object

# s1 was built without an explicit index, so pandas gave it a RangeIndex.
s1.index

RangeIndex(start=0, stop=3, step=1)

# Series.append (formerly: s1.append(s2)) was removed in pandas 2.0;
# pd.concat is the supported way to join Series end to end.
# Each piece keeps its own index labels, and mixing floats with strings
# gives the result dtype object.
s4 = pd.concat([s1, s3])
s4

0    10.0
1    11.0
2    12.0
a      10
b      11
c      12
dtype: object

# s4's index holds both integers and strings; s4[0] returns the entry
# labelled 0.
print(s4[0])
print(s4)
# drop=True discards the old labels and renumbers the rows 0..5.
s4_reset = s4.reset_index(drop=True)
print(s4_reset)
s4_reset[0]

10.0
0    10.0
1    11.0
2    12.0
a      10
b      11
c      12
dtype: object
0    10.0
1    11.0
2    12.0
3      10
4      11
5      12
dtype: object

10.0

# Seven consecutive calendar days starting on 2019-05-03.
dates = pd.date_range(start='20190503', periods=7)

dates

DatetimeIndex(['2019-05-03', '2019-05-04', '2019-05-05', '2019-05-06',
               '2019-05-07', '2019-05-08', '2019-05-09'],
              dtype='datetime64[ns]', freq='D')

DataFrame#

# 7 x 4 frame of standard-normal draws: one row per entry of `dates`,
# columns 'A'..'D'. No seed is set, so the numbers differ on every run.
df = pd.DataFrame(
    data=np.random.randn(7, 4),
    index=dates,
    columns=list('ABCD'),
)
df

	A	B	C	D
2019-05-03	1.106142	-1.411186	-0.664336	-0.998144
2019-05-04	-0.219846	-0.562399	0.882633	-0.183028
2019-05-05	0.130602	-0.559537	-0.683457	1.393433
2019-05-06	2.234397	1.709079	0.610843	0.119211
2019-05-07	0.447079	-0.928171	1.102700	0.010732
2019-05-08	-0.406803	1.557577	-1.035966	-0.112492
2019-05-09	-0.767276	-0.932870	-1.022550	0.422754

# Each dict entry becomes one column. The scalars ('A', 'B', 'F') are
# broadcast to all four rows, and the row labels 2..5 come from the index of
# the Series supplied for 'C'.
df2 = pd.DataFrame(
    {
        'A': 1.0,
        'B': pd.Timestamp('20130102'),
        'C': pd.Series(1, index=[2, 3, 4, 5], dtype='float32'),
        'D': np.array([3, 3, 3, 3], dtype='int32'),
        'E': pd.Categorical(["test", "train", "test", "train"]),
        'F': 'foo',
    }
)
df2

	A	B	C	D	E	F
2	1.0	2013-01-02	1.0	3	test	foo
3	1.0	2013-01-02	1.0	3	train	foo
4	1.0	2013-01-02	1.0	3	test	foo
5	1.0	2013-01-02	1.0	3	train	foo

# One dtype per column of df2.
df2.dtypes

A          float64
B    datetime64[s]
C          float32
D            int32
E         category
F           object
dtype: object

# Load heart.csv from the working directory. This rebinds `df`, replacing the
# random frame created earlier.
# NOTE(review): the 'Unnamed: 0' column in the result looks like a saved row
# number; pd.read_csv('heart.csv', index_col=0) would load it as the index
# instead -- confirm against the file.
df = pd.read_csv('heart.csv')
df

	Unnamed: 0	Age	Sex	ChestPain	RestBP	Chol	Fbs	RestECG	MaxHR	ExAng	Oldpeak	Slope	Ca	Thal	AHD
0	1	63	1	typical	145	233	1	2	150	0	2.3	3	0.0	fixed	No
1	2	67	1	asymptomatic	160	286	0	2	108	1	1.5	2	3.0	normal	Yes
2	3	67	1	asymptomatic	120	229	0	2	129	1	2.6	2	2.0	reversable	Yes
3	4	37	1	nonanginal	130	250	0	0	187	0	3.5	3	0.0	normal	No
4	5	41	0	nontypical	130	204	0	2	172	0	1.4	1	0.0	normal	No
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
298	299	45	1	typical	110	264	0	0	132	0	1.2	2	0.0	reversable	Yes
299	300	68	1	asymptomatic	144	193	1	0	141	0	3.4	2	2.0	reversable	Yes
300	301	57	1	asymptomatic	130	131	0	0	115	1	1.2	2	1.0	reversable	Yes
301	302	57	0	nontypical	130	236	0	2	174	0	0.0	2	1.0	normal	Yes
302	303	38	1	nonanginal	138	175	0	0	173	0	0.0	1	NaN	normal	No

303 rows × 15 columns

Viewing Data#

# First 10 rows.
df.head(10)

	Unnamed: 0	Age	Sex	ChestPain	RestBP	Chol	Fbs	RestECG	MaxHR	ExAng	Oldpeak	Slope	Ca	Thal	AHD
0	1	63	1	typical	145	233	1	2	150	0	2.3	3	0.0	fixed	No
1	2	67	1	asymptomatic	160	286	0	2	108	1	1.5	2	3.0	normal	Yes
2	3	67	1	asymptomatic	120	229	0	2	129	1	2.6	2	2.0	reversable	Yes
3	4	37	1	nonanginal	130	250	0	0	187	0	3.5	3	0.0	normal	No
4	5	41	0	nontypical	130	204	0	2	172	0	1.4	1	0.0	normal	No
5	6	56	1	nontypical	120	236	0	0	178	0	0.8	1	0.0	normal	No
6	7	62	0	asymptomatic	140	268	0	2	160	0	3.6	3	2.0	normal	Yes
7	8	57	0	asymptomatic	120	354	0	0	163	1	0.6	1	0.0	normal	No
8	9	63	1	asymptomatic	130	254	0	2	147	0	1.4	2	1.0	reversable	Yes
9	10	53	1	asymptomatic	140	203	1	2	155	1	3.1	3	0.0	reversable	Yes

# Last 8 rows.
df.tail(8)

	Unnamed: 0	Age	Sex	ChestPain	RestBP	Chol	Fbs	RestECG	MaxHR	ExAng	Oldpeak	Slope	Ca	Thal	AHD
295	296	41	1	nontypical	120	157	0	0	182	0	0.0	1	0.0	normal	No
296	297	59	1	asymptomatic	164	176	1	2	90	0	1.0	2	2.0	fixed	Yes
297	298	57	0	asymptomatic	140	241	0	0	123	1	0.2	2	0.0	reversable	Yes
298	299	45	1	typical	110	264	0	0	132	0	1.2	2	0.0	reversable	Yes
299	300	68	1	asymptomatic	144	193	1	0	141	0	3.4	2	2.0	reversable	Yes
300	301	57	1	asymptomatic	130	131	0	0	115	1	1.2	2	1.0	reversable	Yes
301	302	57	0	nontypical	130	236	0	2	174	0	0.0	2	1.0	normal	Yes
302	303	38	1	nonanginal	138	175	0	0	173	0	0.0	1	NaN	normal	No

# Summary statistics (count, mean, std, min, quartiles, max); only the
# numeric columns appear in the result.
df.describe()

	Unnamed: 0	Age	Sex	RestBP	Chol	Fbs	RestECG	MaxHR	ExAng	Oldpeak	Slope	Ca
count	303.000000	303.000000	303.000000	303.000000	303.000000	303.000000	303.000000	303.000000	303.000000	303.000000	303.000000	299.000000
mean	152.000000	54.438944	0.679868	131.689769	246.693069	0.148515	0.990099	149.607261	0.326733	1.039604	1.600660	0.672241
std	87.612784	9.038662	0.467299	17.599748	51.776918	0.356198	0.994971	22.875003	0.469794	1.161075	0.616226	0.937438
min	1.000000	29.000000	0.000000	94.000000	126.000000	0.000000	0.000000	71.000000	0.000000	0.000000	1.000000	0.000000
25%	76.500000	48.000000	0.000000	120.000000	211.000000	0.000000	0.000000	133.500000	0.000000	0.000000	1.000000	0.000000
50%	152.000000	56.000000	1.000000	130.000000	241.000000	0.000000	1.000000	153.000000	0.000000	0.800000	2.000000	0.000000
75%	227.500000	61.000000	1.000000	140.000000	275.000000	0.000000	2.000000	166.000000	1.000000	1.600000	2.000000	1.000000
max	303.000000	77.000000	1.000000	200.000000	564.000000	1.000000	2.000000	202.000000	1.000000	6.200000	3.000000	3.000000

# Row labels: a RangeIndex running 0..302.
df.index

RangeIndex(start=0, stop=303, step=1)

# Column labels.
df.columns

Index(['Unnamed: 0', 'Age', 'Sex', 'ChestPain', 'RestBP', 'Chol', 'Fbs',
       'RestECG', 'MaxHR', 'ExAng', 'Oldpeak', 'Slope', 'Ca', 'Thal', 'AHD'],
      dtype='object')

# Sorting along axis=0 reorders the rows by index label:
# df.sort_index(axis=0, ascending=True)
# df.sort_index(axis=0, ascending=False)

# Sorting along axis=1 reorders the columns by name. A notebook cell displays
# only its last expression, so the output shown is the descending order.
df.sort_index(axis=1, ascending=True)
df.sort_index(axis=1, ascending=False)

	Unnamed: 0	Thal	Slope	Sex	RestECG	RestBP	Oldpeak	MaxHR	Fbs	ExAng	Chol	ChestPain	Ca	Age	AHD
0	1	fixed	3	1	2	145	2.3	150	1	0	233	typical	0.0	63	No
1	2	normal	2	1	2	160	1.5	108	0	1	286	asymptomatic	3.0	67	Yes
2	3	reversable	2	1	2	120	2.6	129	0	1	229	asymptomatic	2.0	67	Yes
3	4	normal	3	1	0	130	3.5	187	0	0	250	nonanginal	0.0	37	No
4	5	normal	1	0	2	130	1.4	172	0	0	204	nontypical	0.0	41	No
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
298	299	reversable	2	1	0	110	1.2	132	0	0	264	typical	0.0	45	Yes
299	300	reversable	2	1	0	144	3.4	141	1	0	193	asymptomatic	2.0	68	Yes
300	301	reversable	2	1	0	130	1.2	115	0	1	131	asymptomatic	1.0	57	Yes
301	302	normal	2	0	2	130	0.0	174	0	0	236	nontypical	1.0	57	Yes
302	303	normal	1	1	0	138	0.0	173	0	0	175	nonanginal	NaN	38	No

303 rows × 15 columns

# Order the rows by the 'Sex' column, largest value first.
df.sort_values(by = 'Sex', ascending = False)

	Unnamed: 0	Age	Sex	ChestPain	RestBP	Chol	Fbs	RestECG	MaxHR	ExAng	Oldpeak	Slope	Ca	Thal	AHD
302	303	38	1	nonanginal	138	175	0	0	173	0	0.0	1	NaN	normal	No
0	1	63	1	typical	145	233	1	2	150	0	2.3	3	0.0	fixed	No
1	2	67	1	asymptomatic	160	286	0	2	108	1	1.5	2	3.0	normal	Yes
2	3	67	1	asymptomatic	120	229	0	2	129	1	2.6	2	2.0	reversable	Yes
3	4	37	1	nonanginal	130	250	0	0	187	0	3.5	3	0.0	normal	No
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
26	27	58	0	nonanginal	120	340	0	0	172	0	0.0	1	0.0	normal	No
27	28	66	0	typical	150	226	0	0	114	0	2.6	3	0.0	normal	No
30	31	69	0	typical	140	239	0	0	151	0	1.8	1	2.0	normal	No
40	41	65	0	asymptomatic	150	225	0	2	114	0	1.0	2	3.0	reversable	Yes
42	43	71	0	nontypical	160	302	0	0	162	0	0.4	1	2.0	normal	No

303 rows × 15 columns

# Transpose: rows become columns and columns become rows (15 x 303 here).
df.T

	0	1	2	3	4	5	6	7	8	9	...	293	294	295	296	297	298	299	300	301	302
Unnamed: 0	1	2	3	4	5	6	7	8	9	10	...	294	295	296	297	298	299	300	301	302	303
Age	63	67	67	37	41	56	62	57	63	53	...	63	63	41	59	57	45	68	57	57	38
Sex	1	1	1	1	0	1	0	0	1	1	...	1	0	1	1	0	1	1	1	0	1
ChestPain	typical	asymptomatic	asymptomatic	nonanginal	nontypical	nontypical	asymptomatic	asymptomatic	asymptomatic	asymptomatic	...	asymptomatic	asymptomatic	nontypical	asymptomatic	asymptomatic	typical	asymptomatic	asymptomatic	nontypical	nonanginal
RestBP	145	160	120	130	130	120	140	120	130	140	...	140	124	120	164	140	110	144	130	130	138
Chol	233	286	229	250	204	236	268	354	254	203	...	187	197	157	176	241	264	193	131	236	175
Fbs	1	0	0	0	0	0	0	0	0	1	...	0	0	0	1	0	0	1	0	0	0
RestECG	2	2	2	0	2	0	2	0	2	2	...	2	0	0	2	0	0	0	0	2	0
MaxHR	150	108	129	187	172	178	160	163	147	155	...	144	136	182	90	123	132	141	115	174	173
ExAng	0	1	1	0	0	0	0	1	0	1	...	1	1	0	0	1	0	0	1	0	0
Oldpeak	2.3	1.5	2.6	3.5	1.4	0.8	3.6	0.6	1.4	3.1	...	4.0	0.0	0.0	1.0	0.2	1.2	3.4	1.2	0.0	0.0
Slope	3	2	2	3	1	1	3	1	2	3	...	1	2	1	2	2	2	2	2	2	1
Ca	0.0	3.0	2.0	0.0	0.0	0.0	2.0	0.0	1.0	0.0	...	2.0	0.0	0.0	2.0	0.0	0.0	2.0	1.0	1.0	NaN
Thal	fixed	normal	reversable	normal	normal	normal	normal	normal	reversable	reversable	...	reversable	normal	normal	fixed	reversable	reversable	reversable	reversable	normal	normal
AHD	No	Yes	Yes	No	No	No	Yes	No	Yes	Yes	...	Yes	Yes	No	Yes	Yes	Yes	Yes	Yes	Yes	No

15 rows × 303 columns

# Selecting a single column by name returns a Series.
df['Age']

0      63
1      67
2      67
3      37
4      41
       ..
298    45
299    68
300    57
301    57
302    38
Name: Age, Length: 303, dtype: int64

# A list of column names returns a DataFrame holding just those columns.
df[['Age', 'Sex']]

	Age	Sex
0	63	1
1	67	1
2	67	1
3	37	1
4	41	0
...	...	...
298	45	1
299	68	1
300	57	1
301	57	0
302	38	1

303 rows × 2 columns

# Pick the columns, then slice the rows: [0:2] keeps rows 0 and 1.
df[['Age', 'Sex']][0:2]

	Age	Sex
0	63	1
1	67	1

# The other order works too: slice the rows first, then pick the columns.
df[0:2][['Age']]

	Age
0	63
1	67

Data Selection#

# .loc selects by label; a single row label returns that row as a Series
# whose index is the column names.
df.loc[4]
# df

Unnamed: 0             5
Age                   41
Sex                    0
ChestPain     nontypical
RestBP               130
Chol                 204
Fbs                    0
RestECG                2
MaxHR                172
ExAng                  0
Oldpeak              1.4
Slope                  1
Ca                   0.0
Thal              normal
AHD                   No
Name: 4, dtype: object

# .loc with a list of row labels and a list of column names returns that
# sub-table.
df.loc[[3,5], ['Age', 'Fbs']]

	Age	Fbs
3	37	0
5	56	0

# .iloc selects by integer position; this is the row at position 5.
df.iloc[5]

Unnamed: 0             6
Age                   56
Sex                    1
ChestPain     nontypical
RestBP               120
Chol                 236
Fbs                    0
RestECG                0
MaxHR                178
ExAng                  0
Oldpeak              0.8
Slope                  1
Ca                   0.0
Thal              normal
AHD                   No
Name: 5, dtype: object

# Row position 5, column position 3 (the 'ChestPain' column): a single value.
df.iloc[5, 3]

'nontypical'

# Other positional selections to try:
# df.iloc[5:7, 3:7]
# df.iloc[[5, 7, 9], 3:7]
# df.iloc[:, 3]
# Rows at positions 3 and 4 (the stop position 5 is excluded), all columns.
df.iloc[3: 5, :]
# NOTE(review): .loc is label-based, so the integer column slice below would
# presumably fail against df's string column names -- confirm before enabling.
# df.loc[5:7, 3:7]

	Unnamed: 0	Age	Sex	ChestPain	RestBP	Chol	Fbs	RestECG	MaxHR	ExAng	Oldpeak	Slope	Ca	Thal	AHD
3	4	37	1	nonanginal	130	250	0	0	187	0	3.5	3	0.0	normal	No
4	5	41	0	nontypical	130	204	0	2	172	0	1.4	1	0.0	normal	No

# Three ways to read the value at row 1 of the 'Age' column (column
# position 1). Only the last expression is displayed by the notebook.
df.iloc[1, 1]
df.at[1, 'Age'] # Label-based scalar access: a fast, scalar-only .loc
df.iat[1, 1] # Position-based scalar access: a fast, scalar-only .iloc

# An element-wise comparison yields a boolean Series with one entry per row.
df['Age'] > 50

0       True
1       True
2       True
3      False
4      False
       ...  
298    False
299     True
300     True
301     True
302    False
Name: Age, Length: 303, dtype: bool

# Indexing with a boolean Series keeps only the rows where it is True
# (209 of the 303 rows here).
df[df.Age > 50]

	Unnamed: 0	Age	Sex	ChestPain	RestBP	Chol	Fbs	RestECG	MaxHR	ExAng	Oldpeak	Slope	Ca	Thal	AHD
0	1	63	1	typical	145	233	1	2	150	0	2.3	3	0.0	fixed	No
1	2	67	1	asymptomatic	160	286	0	2	108	1	1.5	2	3.0	normal	Yes
2	3	67	1	asymptomatic	120	229	0	2	129	1	2.6	2	2.0	reversable	Yes
5	6	56	1	nontypical	120	236	0	0	178	0	0.8	1	0.0	normal	No
6	7	62	0	asymptomatic	140	268	0	2	160	0	3.6	3	2.0	normal	Yes
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
296	297	59	1	asymptomatic	164	176	1	2	90	0	1.0	2	2.0	fixed	Yes
297	298	57	0	asymptomatic	140	241	0	0	123	1	0.2	2	0.0	reversable	Yes
299	300	68	1	asymptomatic	144	193	1	0	141	0	3.4	2	2.0	reversable	Yes
300	301	57	1	asymptomatic	130	131	0	0	115	1	1.2	2	1.0	reversable	Yes
301	302	57	0	nontypical	130	236	0	2	174	0	0.0	2	1.0	normal	Yes

209 rows × 15 columns

# Work on an independent copy of the first five rows, so that columns added
# to df2 later never touch df. Slicing before copying duplicates only the
# rows that are kept, not the whole 303-row frame.
df2 = df.iloc[:5].copy()
df2

	Unnamed: 0	Age	Sex	ChestPain	RestBP	Chol	Fbs	RestECG	MaxHR	ExAng	Oldpeak	Slope	Ca	Thal	AHD
0	1	63	1	typical	145	233	1	2	150	0	2.3	3	0.0	fixed	No
1	2	67	1	asymptomatic	160	286	0	2	108	1	1.5	2	3.0	normal	Yes
2	3	67	1	asymptomatic	120	229	0	2	129	1	2.6	2	2.0	reversable	Yes
3	4	37	1	nonanginal	130	250	0	0	187	0	3.5	3	0.0	normal	No
4	5	41	0	nontypical	130	204	0	2	172	0	1.4	1	0.0	normal	No

# Assigning a list to a new column name creates that column; the list needs
# one entry per row, e.g. df2['A'] = list(range(len(df2.index))).
df2['cp'] = ['three', 'two', 'one', 'one', 'zero']
df2
# isin() tests membership element-wise and returns a boolean mask.
df2.cp.isin(['one', 'zero'])

0    False
1    False
2     True
3     True
4     True
Name: cp, dtype: bool

# Use the isin() mask to keep only the rows whose 'cp' is 'one' or 'zero'.
df2[df2.cp.isin(['one', 'zero'])]

	Unnamed: 0	Age	Sex	ChestPain	RestBP	Chol	Fbs	RestECG	MaxHR	ExAng	Oldpeak	Slope	Ca	Thal	AHD	cp
2	3	67	1	asymptomatic	120	229	0	2	129	1	2.6	2	2.0	reversable	Yes	one
3	4	37	1	nonanginal	130	250	0	0	187	0	3.5	3	0.0	normal	No	one
4	5	41	0	nontypical	130	204	0	2	172	0	1.4	1	0.0	normal	No	zero

Missing Data#

# reindex() builds a new frame limited to rows 0..3 with one extra column
# 'E'; 'E' did not exist in df, so it starts out as NaN.
df1 = df.reindex(index=range(0, 4), columns=list(df.columns) + ['E'])
# .loc slices include both end labels, so this sets rows 2 and 3.
df1.loc[2:3, 'E'] = 1
df1

	Unnamed: 0	Age	Sex	ChestPain	RestBP	Chol	Fbs	RestECG	MaxHR	ExAng	Oldpeak	Slope	Ca	Thal	AHD	E
0	1	63	1	typical	145	233	1	2	150	0	2.3	3	0.0	fixed	No	NaN
1	2	67	1	asymptomatic	160	286	0	2	108	1	1.5	2	3.0	normal	Yes	NaN
2	3	67	1	asymptomatic	120	229	0	2	129	1	2.6	2	2.0	reversable	Yes	1.0
3	4	37	1	nonanginal	130	250	0	0	187	0	3.5	3	0.0	normal	No	1.0

# Show df1 without the rows that contain at least one missing value.
df1.dropna(how='any')

	Unnamed: 0	Age	Sex	ChestPain	RestBP	Chol	Fbs	RestECG	MaxHR	ExAng	Oldpeak	Slope	Ca	Thal	AHD	E
2	3	67	1	asymptomatic	120	229	0	2	129	1	2.6	2	2.0	reversable	Yes	1.0
3	4	37	1	nonanginal	130	250	0	0	187	0	3.5	3	0.0	normal	No	1.0

# Boolean mask that is True wherever df1 has a missing value.
pd.isna(df1)

	Unnamed: 0	Age	Sex	ChestPain	RestBP	Chol	Fbs	RestECG	MaxHR	ExAng	Oldpeak	Slope	Ca	Thal	AHD	E
0	False	False	False	False	False	False	False	False	False	False	False	False	False	False	False	True
1	False	False	False	False	False	False	False	False	False	False	False	False	False	False	False	True
2	False	False	False	False	False	False	False	False	False	False	False	False	False	False	False	False
3	False	False	False	False	False	False	False	False	False	False	False	False	False	False	False	False

# Show df1 with every missing value replaced by 2.
df1.fillna(value=2)

	Unnamed: 0	Age	Sex	ChestPain	RestBP	Chol	Fbs	RestECG	MaxHR	ExAng	Oldpeak	Slope	Ca	Thal	AHD	E
0	1	63	1	typical	145	233	1	2	150	0	2.3	3	0.0	fixed	No	2.0
1	2	67	1	asymptomatic	160	286	0	2	108	1	1.5	2	3.0	normal	Yes	2.0
2	3	67	1	asymptomatic	120	229	0	2	129	1	2.6	2	2.0	reversable	Yes	1.0
3	4	37	1	nonanginal	130	250	0	0	187	0	3.5	3	0.0	normal	No	1.0

Operations#

# Column means. numeric_only=True restricts the reduction to the numeric
# columns; the traceback recorded after this cell is what the bare
# df1.mean() call raises, because ChestPain, Thal and AHD hold strings.
df1.mean(numeric_only=True)

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
Cell In[40], line 1
----> 1 df1.mean()

File ~/.virtualenvs/.venv/lib/python3.11/site-packages/pandas/core/frame.py:11335, in DataFrame.mean(self, axis, skipna, numeric_only, **kwargs)
  11327 @doc(make_doc("mean", ndim=2))
  11328 def mean(
  11329     self,
   (...)
  11333     **kwargs,
  11334 ):
> 11335     result = super().mean(axis, skipna, numeric_only, **kwargs)
  11336     if isinstance(result, Series):
  11337         result = result.__finalize__(self, method="mean")

File ~/.virtualenvs/.venv/lib/python3.11/site-packages/pandas/core/generic.py:11984, in NDFrame.mean(self, axis, skipna, numeric_only, **kwargs)
  11977 def mean(
  11978     self,
  11979     axis: Axis | None = 0,
   (...)
  11982     **kwargs,
  11983 ) -> Series | float:
> 11984     return self._stat_function(
  11985         "mean", nanops.nanmean, axis, skipna, numeric_only, **kwargs
  11986     )

File ~/.virtualenvs/.venv/lib/python3.11/site-packages/pandas/core/generic.py:11941, in NDFrame._stat_function(self, name, func, axis, skipna, numeric_only, **kwargs)
  11937 nv.validate_func(name, (), kwargs)
  11939 validate_bool_kwarg(skipna, "skipna", none_allowed=False)
> 11941 return self._reduce(
  11942     func, name=name, axis=axis, skipna=skipna, numeric_only=numeric_only
  11943 )

File ~/.virtualenvs/.venv/lib/python3.11/site-packages/pandas/core/frame.py:11204, in DataFrame._reduce(self, op, name, axis, skipna, numeric_only, filter_type, **kwds)
  11200     df = df.T
  11202 # After possibly _get_data and transposing, we are now in the
  11203 #  simple case where we can use BlockManager.reduce
> 11204 res = df._mgr.reduce(blk_func)
  11205 out = df._constructor_from_mgr(res, axes=res.axes).iloc[0]
  11206 if out_dtype is not None and out.dtype != "boolean":

File ~/.virtualenvs/.venv/lib/python3.11/site-packages/pandas/core/internals/managers.py:1459, in BlockManager.reduce(self, func)
   1457 res_blocks: list[Block] = []
   1458 for blk in self.blocks:
-> 1459     nbs = blk.reduce(func)
   1460     res_blocks.extend(nbs)
   1462 index = Index([None])  # placeholder

File ~/.virtualenvs/.venv/lib/python3.11/site-packages/pandas/core/internals/blocks.py:377, in Block.reduce(self, func)
    371 @final
    372 def reduce(self, func) -> list[Block]:
    373     # We will apply the function and reshape the result into a single-row
    374     #  Block with the same mgr_locs; squeezing will be done at a higher level
    375     assert self.ndim == 2
--> 377     result = func(self.values)
    379     if self.values.ndim == 1:
    380         res_values = result

File ~/.virtualenvs/.venv/lib/python3.11/site-packages/pandas/core/frame.py:11136, in DataFrame._reduce.<locals>.blk_func(values, axis)
  11134         return np.array([result])
  11135 else:
> 11136     return op(values, axis=axis, skipna=skipna, **kwds)

File ~/.virtualenvs/.venv/lib/python3.11/site-packages/pandas/core/nanops.py:147, in bottleneck_switch.__call__.<locals>.f(values, axis, skipna, **kwds)
    145         result = alt(values, axis=axis, skipna=skipna, **kwds)
    146 else:
--> 147     result = alt(values, axis=axis, skipna=skipna, **kwds)
    149 return result

File ~/.virtualenvs/.venv/lib/python3.11/site-packages/pandas/core/nanops.py:404, in _datetimelike_compat.<locals>.new_func(values, axis, skipna, mask, **kwargs)
    401 if datetimelike and mask is None:
    402     mask = isna(values)
--> 404 result = func(values, axis=axis, skipna=skipna, mask=mask, **kwargs)
    406 if datetimelike:
    407     result = _wrap_results(result, orig_values.dtype, fill_value=iNaT)

File ~/.virtualenvs/.venv/lib/python3.11/site-packages/pandas/core/nanops.py:720, in nanmean(values, axis, skipna, mask)
    718 count = _get_counts(values.shape, mask, axis, dtype=dtype_count)
    719 the_sum = values.sum(axis, dtype=dtype_sum)
--> 720 the_sum = _ensure_numeric(the_sum)
    722 if axis is not None and getattr(the_sum, "ndim", False):
    723     count = cast(np.ndarray, count)

File ~/.virtualenvs/.venv/lib/python3.11/site-packages/pandas/core/nanops.py:1678, in _ensure_numeric(x)
   1675 inferred = lib.infer_dtype(x)
   1676 if inferred in ["string", "mixed"]:
   1677     # GH#44008, GH#36703 avoid casting e.g. strings to numeric
-> 1678     raise TypeError(f"Could not convert {x} to numeric")
   1679 try:
   1680     x = x.astype(np.complex128)

TypeError: Could not convert ['typicalasymptomaticasymptomaticnonanginal' 'fixednormalreversablenormal'
 'NoYesYesNo'] to numeric

# Row means over the numeric columns. Without numeric_only=True the string
# columns (ChestPain, Thal, AHD) make pandas raise TypeError, as the
# df1.mean() traceback above shows.
df1.mean(axis=1, numeric_only=True)

# df1.sub([10 for _ in range(4)], axis = 'index')
# Multiply by a list holding one factor per row; axis='index' lines the list
# up with the rows.
# NOTE(review): df1 still contains string columns -- confirm that these
# operations behave as intended on them.
df1.mul([10 for _ in range(4)], axis = 'index')

df1

# DataFrame.apply() calls the function once per column.
df1.apply(np.cumsum)

# Series.apply() calls the function once per element (presumably converting
# an age in years to days -- confirm).
df1.Age.apply(lambda x: x*365.25)

# Ten random integers drawn from 0..6; value_counts() tallies how often each
# distinct value occurs.
s = pd.Series(np.random.randint(low=0, high=7, size=10))
s

s.value_counts()

# String methods are reached through the .str accessor and are applied
# element by element; the missing entry stays missing.
s = pd.Series(
    ['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat']
)
s
s.str.lower()

Merge#

# Cut a 10-row frame into three consecutive row blocks (3 + 4 + 3 rows).
df2 = pd.DataFrame(np.random.randn(10, 4))
pieces = [df2.iloc[:3], df2.iloc[3:7], df2.iloc[7:]]

pieces

# pd.concat stacks the blocks in the order given (here: reversed); every row
# keeps its original label.
pd.concat(pieces[::-1])

# Both frames repeat the key 'foo', so the join pairs every left row with
# every right row (2 x 2 = 4 result rows).
left = pd.DataFrame(dict(key=['foo', 'foo'], lval=[1, 2]))
right = pd.DataFrame(dict(key=['foo', 'foo'], rval=[4, 5]))

left

right

pd.merge(left, right, on='key')

# With distinct keys each left row matches exactly one right row, so the
# result has one row per key.
left = pd.DataFrame(dict(key=['foo', 'bar'], lval=[1, 2]))
right = pd.DataFrame(dict(key=['foo', 'bar'], rval=[4, 5]))
pd.merge(left, right, on='key')

left

right

df = pd.DataFrame(np.random.randn(8, 4), columns=['A', 'B', 'C', 'D'])
# DataFrame.append was removed in pandas 2.0; pd.concat replaces it.
# Selecting the row with a list (iloc[[3]]) keeps it a one-row DataFrame, so
# it lines up with df's columns when stacked underneath.
# pd.concat([df, df.iloc[[3]]])  # variant that keeps the duplicated label 3
# ignore_index=True renumbers the rows 0..8.
pd.concat([df, df.iloc[[3]]], ignore_index=True)

Grouping#

# Two label columns ('A', 'B') to group on and two random value columns.
df = pd.DataFrame(
    {
        'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
        'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
        'C': np.random.randn(8),
        'D': np.random.randn(8),
    }
)

df

# One result row per distinct value of 'A'.
df.groupby('A').sum()

# Grouping on two columns gives one row per ('A', 'B') pair, indexed by a
# two-level MultiIndex.
df.groupby(['A', 'B']).sum()

Plotting#

# Bar chart of the per-group totals of column 'C' (one bar per value of 'A').
df.groupby('A')['C'].sum().plot(kind='bar')

# Random values in [0, 1) for four columns; plot.bar() draws one group of
# bars per row.
df2 = pd.DataFrame(np.random.rand(10, 4), columns=['a', 'b', 'c', 'd'])
df2.plot.bar()

# stacked=True stacks the four columns into a single bar per row. The
# trailing semicolon keeps the notebook from echoing the returned object.
df2.plot.bar(stacked=True);

# Horizontal version of the stacked chart.
df2.plot.barh(stacked=True)

File I/O#

# Reload heart.csv (df was rebound to other frames in the sections above)
# and preview the first rows.
df = pd.read_csv('heart.csv')
df.head()

# df.to_csv('heart_copy.csv', index = False)

# df1 = df[:10]
# df2 = df[10:20]
# df3 = df[20:40]

# with pd.ExcelWriter('path_to_file.xlsx') as writer:
#     df1.to_excel(writer, sheet_name='Sheet1', index = False)
#     df2.to_excel(writer, sheet_name='Sheet2', index = False)
#     df3.to_excel(writer, sheet_name='Sheet3')

Using pandas

Contents

Using pandas#

Series#

DataFrame#

Viewing Data#

Data Selection#

Missing Data#

Operations#

Merge#

Grouping#

Plotting#

File I/O#