Pandas is a data manipulation and analysis tool that is built on NumPy.
Pandas uses a data structure known as a DataFrame (think of it as Microsoft Excel in Python).
DataFrames enable programmers to store and manipulate data in a tabular fashion (rows and columns).
Series vs. DataFrame: a Series can be thought of as a single column of a DataFrame.
import pandas as pd
# Define two plain Python lists: the stock tickers and one label per ticker.
# Expected values are shown in trailing '#' comments ('//' is floor division
# in Python, not a comment marker).
my_list = ['AAPL', 'AMZN', 'T']
my_list  # ['AAPL', 'AMZN', 'T']
label = ['stock#1', 'stock#2', 'stock#3']
label  # ['stock#1', 'stock#2', 'stock#3']

# Create a one-dimensional pandas Series.
# A Series is formed of data values and their associated index labels.
x_series = pd.Series(data=my_list, index=label)

# View the series
x_series
***********************************
stock#1 AAPL
stock#2 AMZN
stock#3 T
dtype: object
***********************************
# Inspect the type of the object. The expected result is given as a '#'
# comment; written with '//' it would be evaluated as a floor division.
type(x_series)  # pandas.core.series.Series
# Build a two-dimensional pandas DataFrame from a Python dictionary:
# each key becomes a column name and each list holds that column's values.
bank_client_df = pd.DataFrame(
    data={
        'Bank client ID': [111, 222, 333, 444],
        'Bank client Name': ['Chanel', 'Steve', 'Mitch', 'Ryan'],
        'Net Worth [$]': [3500, 29000, 10000, 2000],
        'Years with bank': [3, 4, 9, 5],
    }
)
bank_client_df
**********************************************
Bank client ID Bank client Name Net Worth [$] Years with bank
0 111 Chanel 3500 3
1 222 Steve 29000 4
2 333 Mitch 10000 9
3 444 Ryan 2000 5
**************************************************
# .head(n) shows only the first n rows (here, the first two)
bank_client_df.head(n=2)
# .tail(n) shows only the last n rows (here, the last one)
bank_client_df.tail(n=1)
PANDAS WITH CSV AND HTML DATA
# To access data stored on Google Drive, mount the drive first so that its
# contents can be read; it is mounted at /content/drive.
from google.colab import drive
drive.mount('/content/drive')
# Load a CSV file from the mounted drive into a DataFrame
bank_df = pd.read_csv(
    filepath_or_buffer='/content/drive/My Drive/foldername/bank_client_information.csv'
)

# Save to a CSV file, leaving the row index out
bank_df.to_csv(path_or_buf='sample_output_fajar.csv', index=False)

# Save to a second CSV file, this time including the row index
bank_df.to_csv(path_or_buf='sample_output_2.csv', index=True)
Read Html
# Read the tabular data found on a web page with read_html.
# The result is indexed below to pick out the first table.
house_prices_df = pd.read_html(
    io='https://www.livingin-canada.com/house-prices-canada.html'
)
house_prices_df[0]
PANDAS OPERATIONS
# Define the example DataFrame used by the operations that follow
# (one dictionary key per column, one list entry per client).
bank_client_df = pd.DataFrame(
    data={
        'Bank client ID': [111, 222, 333, 444],
        'Bank client Name': ['Chanel', 'Steve', 'Mitch', 'Ryan'],
        'Net Worth [$]': [3500, 29000, 10000, 2000],
        'Years with bank': [3, 4, 9, 5],
    }
)
bank_client_df
   Bank client ID  Bank client Name  Net Worth [$]  Years with bank
0             111            Chanel           3500                3
1             222             Steve          29000                4
2             333             Mitch          10000                9
3             444              Ryan           2000                5
# Select only the rows that meet a condition:
# here, clients who have been with the bank for at least 5 years.
df_loyal = bank_client_df.loc[bank_client_df['Years with bank'] >= 5]
df_loyal
   Bank client ID  Bank client Name  Net Worth [$]  Years with bank
2             333             Mitch          10000                9
3             444              Ryan           2000                5
# Remove the 'Bank client ID' column from the DataFrame in place
bank_client_df.drop(columns='Bank client ID', inplace=True)
bank_client_df
# Define a function that increases a client's net worth (stocks) by a fixed
# 10% (for simplicity's sake)
def networth_update(balance):
    """Return ``balance`` increased by 10%.

    Args:
        balance: Current net worth; any value that supports multiplication
            by a float.

    Returns:
        ``balance * 1.1``.
    """
    return balance * 1.1  # assume that stock prices increased by 10%
# Apply the function to every value in the net-worth column.
# The key must match the column name 'Net Worth [$]' exactly (capital 'W');
# a differently-cased key raises KeyError.
# The result is not assigned, so bank_client_df keeps its original values.
bank_client_df['Net Worth [$]'].apply(networth_update)
# Total years across all clients. The expected value is a '#' comment;
# written as '//21' it would floor-divide the sum by 21.
bank_client_df['Years with bank'].sum()  # 21
# A histogram represents data using bars of various heights.
# Each bar groups numbers into a specific range (a bin).
# Taller bars show that more data falls within that range.
# NOTE(review): daily_return_df and plt are not defined in this section;
# presumably a DataFrame of daily returns and matplotlib.pyplot. Confirm
# both are set up before this code runs.
mu = daily_return_df['AAPL'].mean()  # mean of distribution
sigma = daily_return_df['AAPL'].std()  # standard deviation of distribution
num_bins = 40

# Plot the histogram of the returns
plt.figure(figsize=(7, 5))
plt.hist(daily_return_df['AAPL'], num_bins, facecolor='blue');
plt.grid()
# hist() is called without density=True, so the bar heights are counts per
# bin rather than probabilities; the axis label reflects that.
plt.ylabel('Frequency')
plt.title(f'Histogram: mu={mu}, sigma={sigma}');