# Using pandas

In [None]:
import pandas as pd
import numpy as np # for generating random numbers

## Series

In [None]:
# A float Series built from integer literals; no index is given, so pandas
# supplies the default 0..2 one.
s1 = pd.Series(data=[10, 11, 12], dtype=float)
s1

In [None]:
# The same three values stored with an integer dtype.
s2 = pd.Series(data=[10, 11, 12], dtype=int)
s2

In [None]:
# String values with a Series name and an explicit, named label index.
s3 = pd.Series(
    data=["10", "11", "12"],
    index=pd.Index(data=['a', 'b', 'c'], name='label'),
    name='Val',
)
s3

In [None]:
# s1 was created without an explicit index, so this shows the default one.
s1.index

In [None]:
# Stack s1 and s3 end to end; every element keeps its original index label.
# (Older pandas spelled this with Series.append; pd.concat is the supported form.)
s4 = pd.concat(objs=[s1, s3])
s4

In [None]:
# s4 carries the labels of both inputs (s1's default integers and s3's
# 'a', 'b', 'c'); 0 is one of those labels, so s4[0] is a label lookup.
print(s4[0])
print(s4)
# Throw the combined labels away and renumber the rows from 0.
s4_reset = s4.reset_index(drop=True)
print(s4_reset)
s4_reset[0]

In [None]:
# Seven consecutive daily timestamps starting on 2019-05-03.
dates = pd.date_range(start='20190503', periods=7)

dates

## DataFrame

In [None]:
# 7x4 frame of standard-normal samples: one row per entry of `dates`,
# columns labelled A to D.
df = pd.DataFrame(
    data=np.random.randn(7, 4),
    index=dates,
    columns=['A', 'B', 'C', 'D'],
)
df

In [None]:
# Each dict entry becomes one column. The Series in 'C' supplies the row
# labels (2..5); scalars are repeated for every row and the length-4 arrays
# are laid against those rows in order.
df2 = pd.DataFrame({
    'A': 1.0,
    'B': pd.Timestamp('20130102'),
    'C': pd.Series(1, index=[2, 3, 4, 5], dtype='float32'),
    'D': np.full(4, 3, dtype='int32'),
    'E': pd.Categorical(["test", "train", "test", "train"]),
    'F': 'foo',
})
df2

In [None]:
# Every column of df2 keeps its own dtype.
df2.dtypes

In [None]:
# Rebind df to the contents of heart.csv (the path is relative to the
# current working directory); the random frame previously bound to df is dropped.
df = pd.read_csv('heart.csv')
df

## Viewing Data

In [None]:
# First 10 rows.
df.head(10)

In [None]:
# Last 8 rows.
df.tail(8)

In [None]:
# Summary statistics per column (numeric columns by default).
df.describe()

In [None]:
# Row labels.
df.index

In [None]:
# Column labels.
df.columns

In [None]:
# df.sort_index(axis=0, ascending=True)
# df.sort_index(axis=0, ascending=False)

# axis=1 orders the columns by their labels. Neither call is assigned, so df
# itself is left as it was; a notebook shows only the last expression of a
# cell, so only the descending result is displayed.
df.sort_index(axis=1, ascending=True)
df.sort_index(axis=1, ascending=False)

In [None]:
# Rows ordered by the Sex column, largest value first.
df.sort_values(by = 'Sex', ascending = False)

In [None]:
# Transpose: rows become columns and columns become rows.
df.T

In [None]:
# A single column label returns that column as a Series.
df['Age']

In [None]:
# A list of labels returns a DataFrame with just those columns.
df[['Age', 'Sex']]

In [None]:
# Pick the columns first, then slice the first two rows.
df[['Age', 'Sex']][0:2]

In [None]:
# Same idea in the other order; [['Age']] keeps the result a one-column DataFrame.
df[0:2][['Age']]

## Data Selection

In [None]:
# .loc selects by index label: the row labelled 4.
df.loc[4]
# df

In [None]:
# Rows labelled 3 and 5, restricted to the Age and Fbs columns.
df.loc[[3,5], ['Age', 'Fbs']]

In [None]:
# .iloc selects by integer position: the row at position 5.
df.iloc[5]

In [None]:
# The single value at row position 5, column position 3.
df.iloc[5, 3]

In [None]:
# df.iloc[5:7, 3:7]
# df.iloc[[5, 7, 9], 3:7]
# df.iloc[:, 3]
# Rows at positions 3 and 4 (the end of an iloc slice is exclusive), all columns.
df.iloc[3: 5, :]
# df.loc[5:7, 3:7]

In [None]:
# Three ways to read one scalar; only the last expression's value is displayed.
# NOTE(review): the label-based .at[1, 'Age'] matches the positional lookups
# only if 'Age' is the column at position 1 and the row labelled 1 sits at
# position 1 -- confirm against heart.csv.
df.iloc[1, 1]
df.at[1, 'Age'] # Get scalar values. It's a very fast loc
df.iat[1, 1] # Get scalar values. It's a very fast iloc

In [None]:
# Boolean Series: True on the rows where Age exceeds 50.
df['Age'] > 50

In [None]:
# Index with that boolean mask to keep only the matching rows.
df[df.Age > 50]

In [None]:
# Work on an independent five-row frame so that later edits to df2 can never
# reach df. Slicing first and copying second duplicates only the rows that
# are kept, and leaves df2 owning its data rather than being a slice of a
# temporary full copy.
df2 = df.iloc[0:5, :].copy()
df2

In [None]:
# dfa['A'] = list(range(len(dfa.index)))  # use this form to create a new column
# Assign one string per row to column 'cp' (df2 was cut down to five rows).
# NOTE(review): if heart.csv already has a 'cp' column this replaces its
# values in df2 -- confirm that is intended.
df2['cp'] = ['three', 'two', 'one', 'one', 'zero']
df2
# Boolean Series: True where cp is 'one' or 'zero'.
df2.cp.isin(['one', 'zero'])

In [None]:
# Use the membership mask to keep only the rows whose cp is 'one' or 'zero'.
df2[df2.cp.isin(['one', 'zero'])]

## Missing Data

In [None]:
# Keep the rows labelled 0..3 and append a new column 'E', which starts out
# entirely missing; then fill 'E' for the rows labelled 2 through 3.
df1 = df.reindex(index=range(4), columns=[*df.columns, 'E'])
df1.loc[2:3, 'E'] = 1
df1

In [None]:
# Drop every row that has at least one missing value (result is not assigned).
df1.dropna(how='any')

In [None]:
# Boolean frame marking which entries are missing.
pd.isna(df1)

In [None]:
# Replace missing entries with 2 (result is not assigned, so df1 keeps its gaps).
df1.fillna(value=2)

## Operations

In [None]:
# Mean of each column (the default axis).
df1.mean()

In [None]:
# Mean of each row.
df1.mean(axis = 1)

In [None]:
# df1.sub([10 for _ in range(4)], axis = 'index')
# Multiply each of the four rows by 10; axis='index' lines the list up with
# the rows rather than the columns.
df1.mul([10] * 4, axis='index')

In [None]:
# The arithmetic result above was not assigned, so df1 is shown unchanged.
df1

In [None]:
# Running total down each column.
df1.apply(np.cumsum)

In [None]:
# Element-wise transform of the Age column: each value times 365.25.
df1.Age.apply(lambda x: x*365.25)

In [None]:
# Ten random integers drawn from 0..6 (the upper bound 7 is exclusive).
s = pd.Series(np.random.randint(low=0, high=7, size=10))
s

In [None]:
# How many times each distinct value occurs in s.
s.value_counts()

In [None]:
# Mixed-case strings with one missing value in the middle.
s = pd.Series(
    ['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'],
)
s
# The .str accessor applies the string method to every element; the missing
# entry stays missing.
s.str.lower()

## Merge

In [None]:
# Cut a random 10x4 frame into three row chunks (3, 4 and 3 rows), then glue
# the chunks back together in reverse chunk order.
df2 = pd.DataFrame(np.random.randn(10, 4))
pieces = [df2.iloc[:3], df2.iloc[3:7], df2.iloc[7:]]

pieces

pd.concat(pieces[::-1])

In [None]:
# Two small frames in which the join key 'foo' appears twice on each side.
left = pd.DataFrame({'key': ['foo'] * 2, 'lval': [1, 2]})
right = pd.DataFrame({'key': ['foo'] * 2, 'rval': [4, 5]})

left


In [None]:
# Show the right-hand frame.
right

In [None]:
# Both frames hold the key 'foo' twice, so every left row pairs with every
# right row: 2 x 2 = 4 result rows.
pd.merge(left, right, on='key')

In [None]:
# With unique keys each left row matches exactly one right row.
left = pd.DataFrame([('foo', 1), ('bar', 2)], columns=['key', 'lval'])
right = pd.DataFrame([('foo', 4), ('bar', 5)], columns=['key', 'rval'])
left.merge(right, on='key')

In [None]:
# The merge result was not assigned, so both inputs are shown as built.
left

In [None]:
right

In [None]:
# Append a copy of the row at position 3 to the bottom of a random 8x4 frame.
# DataFrame.append no longer exists in pandas 2.x, so the row is selected as a
# one-row DataFrame (note the double brackets) and joined with pd.concat;
# ignore_index=True renumbers the result 0..8.
df = pd.DataFrame(np.random.randn(8, 4), columns=['A', 'B', 'C', 'D'])
df_appended = pd.concat([df, df.iloc[[3]]], ignore_index=True)
df_appended

## Grouping

In [None]:
# Demo frame for grouping: two string key columns (A, B) and two columns of
# standard-normal samples (C, D).
df = pd.DataFrame({
    'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
    'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
    'C': np.random.randn(8),
    'D': np.random.randn(8),
})

df

In [None]:
# Sum the numeric columns within each value of A. Column B holds strings, so
# C and D are selected explicitly; the result is then the same numeric
# aggregate regardless of how the installed pandas treats non-numeric columns
# in sum().
df.groupby('A')[['C', 'D']].sum()

In [None]:
# Group by the (A, B) pair and sum the remaining columns within each pair.
df.groupby(['A', 'B']).sum()

## Plotting

In [None]:
# Bar chart of the per-group total of column C; the column is picked before
# aggregating, so only C is summed.
df.groupby('A')['C'].sum().plot(kind='bar')

In [None]:
# Ten rows of uniform random samples in four columns, drawn as grouped
# vertical bars (one group per row, one bar per column).
df2 = pd.DataFrame(np.random.rand(10, 4), columns=['a', 'b', 'c', 'd'])
df2.plot.bar()

In [None]:
# Stacked vertical bars; the trailing semicolon keeps the notebook from
# echoing the return value of the call.
df2.plot.bar(stacked=True);

In [None]:
# Stacked horizontal bars.
df2.plot.barh(stacked=True)

## File I/O

In [None]:
# Reload heart.csv into df (replacing the grouping demo frame) and preview
# the first rows.
df = pd.read_csv('heart.csv')
df.head()

In [None]:
# df.to_csv('heart_copy.csv', index = False)

In [None]:
# df1 = df[:10]
# df2 = df[10:20]
# df3 = df[20:40]

In [None]:
# with pd.ExcelWriter('path_to_file.xlsx') as writer:
#     df1.to_excel(writer, sheet_name='Sheet1', index = False)
#     df2.to_excel(writer, sheet_name='Sheet2', index = False)
#     df3.to_excel(writer, sheet_name='Sheet3')