Data Visualization using Python

Pandas Basic

  • Pandas is a data manipulation and analysis tool that is built on NumPy.
  • Pandas uses a data structure known as a DataFrame (think of it as Microsoft Excel in Python).
  • DataFrames empower programmers to store and manipulate data in a tabular fashion (rows and columns).
  • Series vs. DataFrame: a Series can be thought of as a single column of a DataFrame.
import pandas as pd
# Let's define two lists as shown below:
my_list = ['AAPL', 'AMZN', 'T']
my_list  # ['AAPL', 'AMZN', 'T']
label = ['stock#1', 'stock#2', 'stock#3']
label  # ['stock#1', 'stock#2', 'stock#3']

# Let's create a one-dimensional Pandas Series.
# Note that a Series is formed of data and associated labels (the index).
x_series = pd.Series(data=my_list, index=label)

# Let's view the series
x_series
***********************************
stock#1    AAPL
stock#2    AMZN
stock#3       T
dtype: object
***********************************
# Let's obtain the datatype
type(x_series)  # pandas.core.series.Series
# Build a two-dimensional Pandas DataFrame from a plain Python dictionary:
# each key becomes a column name and each list supplies that column's values.
bank_client_df = pd.DataFrame(
    data={
        'Bank client ID': [111, 222, 333, 444],
        'Bank client Name': ['Chanel', 'Steve', 'Mitch', 'Ryan'],
        'Net Worth [$]': [3500, 29000, 10000, 2000],
        'Years with bank': [3, 4, 9, 5],
    },
)
bank_client_df

**********************************************

Bank client ID	Bank client Name	Net Worth [$]	Years with bank
0	111	Chanel	3500	3
1	222	Steve	29000	4
2	333	Mitch	10000	9
3	444	Ryan	2000	5
**************************************************
# Show just the first two rows with .head()
bank_client_df.head(n=2)

# Show just the last row with .tail()
bank_client_df.tail(n=1)
PANDAS WITH CSV AND HTML DATA
# To access data stored on Google Drive from Colab, mount the drive first so its contents become visible
from google.colab import drive
drive.mount('/content/drive')
# Pandas is used to read a csv file and store data in a DataFrame
bank_df = pd.read_csv('/content/drive/My Drive/foldername/bank_client_information.csv')
# write to a csv file without an index
bank_df.to_csv('sample_output_fajar.csv',index = False)

# write to a csv file with an index
bank_df.to_csv('sample_output_2.csv',index = True)
Read Html
# Read tabular data using read_html
# pd.read_html returns a list of DataFrames (one per table found on the page),
# so despite its name house_prices_df is a list
house_prices_df = pd.read_html('https://www.livingin-canada.com/house-prices-canada.html')

# Pick the first table from the list
house_prices_df[0]
PANDAS OPERATIONS
# Let's define a dataframe as follows.
# The column names are spelled exactly as in the output printed for this section
# ('Bank Client Name', 'Net worth [$]').
bank_client_df = pd.DataFrame({'Bank client ID': [111, 222, 333, 444],
                               'Bank Client Name': ['Chanel', 'Steve', 'Mitch', 'Ryan'],
                               'Net worth [$]': [3500, 29000, 10000, 2000],
                               'Years with bank': [3, 4, 9, 5]})
bank_client_df
Bank client ID	Bank Client Name	Net worth [$]	Years with bank
0	111	Chanel	3500	3
1	222	Steve	29000	4
2	333	Mitch	10000	9
3	444	Ryan	2000	5
# Keep only the rows that meet a condition: clients with at least 5 years at the bank
df_loyal = bank_client_df.loc[bank_client_df['Years with bank'] >= 5]
df_loyal
Bank client ID	Bank Client Name	Net worth [$]	Years with bank
2	333	Mitch	10000	9
3	444	Ryan	2000	5
# Remove a column from the DataFrame in place
bank_client_df.drop(columns=['Bank client ID'], inplace=True)
bank_client_df
PANDAS WITH FUNCTIONS
# Sample data for this section: four bank clients, one column per dictionary key
bank_client_df = pd.DataFrame(
    data={
        'Bank client ID': [111, 222, 333, 444],
        'Bank Client Name': ['Chanel', 'Steve', 'Mitch', 'Ryan'],
        'Net worth [$]': [3500, 29000, 10000, 2000],
        'Years with bank': [3, 4, 9, 5],
    },
)
bank_client_df
# Define a function that increases a client's net worth (stocks) by a fixed percentage
# (10% by default, for simplicity's sake)
def networth_update(balance, rate=0.10):
    """Return ``balance`` grown by ``rate``.

    Args:
        balance: Current net worth; any value that supports multiplication
            by a float (for example each element handed over by ``Series.apply``).
        rate: Fractional increase to apply. Defaults to 0.10, i.e. stock
            prices are assumed to have increased by 10%.

    Returns:
        ``balance * (1 + rate)``.
    """
    return balance * (1 + rate)

# Apply the function to every value of a DataFrame column.
# The result is not assigned back, so bank_client_df itself is left unchanged.
bank_client_df['Net worth [$]'].apply(networth_update)

# Total number of years with the bank across all clients
bank_client_df['Years with bank'].sum()  # 21
SORTING AND ORDERING
# Sample data for the sorting examples
bank_client_df = pd.DataFrame(
    data={
        'Bank client ID': [111, 222, 333, 444],
        'Bank Client Name': ['Chanel', 'Steve', 'Mitch', 'Ryan'],
        'Net worth [$]': [3500, 29000, 10000, 2000],
        'Years with bank': [3, 4, 9, 5],
    },
)
bank_client_df
# sort_values returns a sorted copy (here ordered by years with the bank);
# bank_client_df itself keeps its original row order
bank_client_df.sort_values('Years with bank')
# With inplace=True the DataFrame itself is reordered instead of a copy being returned
bank_client_df.sort_values('Years with bank', inplace=True)
CONCATENATING AND MERGING WITH PANDAS
# Three frames sharing columns A-D; every cell is its column letter followed by its row label
df1 = pd.DataFrame({col: [col + str(row) for row in range(0, 4)] for col in 'ABCD'},
                   index=list(range(0, 4)))

df2 = pd.DataFrame({col: [col + str(row) for row in range(4, 8)] for col in 'ABCD'},
                   index=list(range(4, 8)))

df3 = pd.DataFrame({col: [col + str(row) for row in range(8, 12)] for col in 'ABCD'},
                   index=list(range(8, 12)))

# Stack the three frames on top of each other (row-wise)
pd.concat([df1, df2, df3])
# Creating dataframes from dictionaries
# First batch of bank clients: IDs 1 to 5
raw_data = {
    'Bank Client ID': ['1', '2', '3', '4', '5'],
    'First Name': ['Nancy', 'Alex', 'Shep', 'Max', 'Allen'],
    'Last Name': ['Rob', 'Ali', 'George', 'Mitch', 'Steve'],
}
Bank_df_1 = pd.DataFrame(data=raw_data, columns=['Bank Client ID', 'First Name', 'Last Name'])
Bank_df_1

# Second, separate batch of clients: IDs 6 to 10
raw_data = {
    'Bank Client ID': ['6', '7', '8', '9', '10'],
    'First Name': ['Bill', 'Dina', 'Sarah', 'Heather', 'Holly'],
    'Last Name': ['Christian', 'Mo', 'Steve', 'Bob', 'Michelle'],
}
Bank_df_2 = pd.DataFrame(data=raw_data, columns=['Bank Client ID', 'First Name', 'Last Name'])
Bank_df_2

# Additional information (annual salary) covering every client, IDs 1 to 10
raw_data = {
    'Bank Client ID': ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10'],
    'Annual Salary [$/year]': [25000, 35000, 45000, 48000, 49000, 32000, 33000, 34000, 23000, 22000],
}
bank_df_salary = pd.DataFrame(data=raw_data, columns=['Bank Client ID', 'Annual Salary [$/year]'])

# Stack both client frames so that IDs 1 to 10 live in a single frame
bank_df_all = pd.concat(objs=[Bank_df_1, Bank_df_2])

# Attach the salary column by matching rows on 'Bank Client ID'
bank_df_all = pd.merge(left=bank_df_all, right=bank_df_salary, on='Bank Client ID')

Matplotlib

BASIC LINE PLOT
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
# Read stocks.csv into a DataFrame
# NOTE(review): the path presumes Google Drive is already mounted at /content/drive — confirm
stock_df = pd.read_csv('/content/drive/My Drive/foldername/stocks.csv')

stock_df
Date	AAPL	BA	T	MGM	AMZN	IBM	TSLA	GOOG	sp500
0	2012-01-12	60.198570	75.510002	30.120001	12.130000	175.929993	180.550003	28.250000	313.644379	1295.500000
1	2012-01-13	59.972858	74.599998	30.070000	12.350000	178.419998	179.160004	22.790001	311.328064	1289.089966
2	2012-01-17	60.671429	75.239998	30.250000	12.250000	181.660004	180.000000	26.600000	313.116364	1293.670044
3	2012-01-18	61.301430	75.059998	30.330000	12.730000	189.440002	181.070007	26.809999	315.273285	1308.040039
4	2012-01-19	61.107143	75.559998	30.420000	12.800000	194.449997	180.520004	26.760000	318.590851	1314.500000
# Line plot of the AAPL column against Date; the trailing semicolon is harmless to the plot
stock_df.plot(x = 'Date', y = 'AAPL', label = 'APPLE Stock Price', linewidth = 3);
# The pyplot calls below act on the current axes, i.e. the plot just created
plt.ylabel('Price')
plt.title('My first plotting exercise!')
plt.legend(loc="upper left")
plt.grid()
SCATTERPLOT
# Read daily_returns.csv into a DataFrame
# NOTE(review): the path presumes Google Drive is already mounted at /content/drive — confirm
daily_return_df = pd.read_csv('/content/drive/My Drive/foldername/daily_returns.csv')

daily_return_df
AAPL	sp500	GOOG
0	0.000000	0.000000	0.000000
1	-0.374946	-0.494792	-0.738516
2	1.164812	0.355295	0.574410
3	1.038382	1.110793	0.688856
4	-0.316937	0.493866	1.052283
import random 

# Scatter plot of AAPL values (x-axis) against sp500 values (y-axis)
X = daily_return_df['AAPL']
Y = daily_return_df['sp500']

plt.scatter(X, Y);
Scatter plot
PIE CHART
# First pie chart: unequal wedge sizes
values  = [20, 55, 5, 17, 3]
colors  = ['g', 'r', 'y', 'b', 'm']
# explode offsets a wedge outward from the centre; 0 leaves it in place
explode = [0, 0.2, 0, 0, 0.2]
labels  = ['AAPL', 'GOOG', 'T', 'TSLA', 'AMZN']

# Use matplotlib to plot a pie chart 
plt.figure(figsize = (7, 7))
plt.pie(values, colors = colors, labels = labels, explode = explode)
plt.title('STOCK PORTFOLIO')
plt.show()
# Second pie chart: five equal wedges, same colours and explode offsets.
# No plt.figure() call here, so no explicit figure size is set for this chart.
values = [20, 20, 20, 20, 20]
colors = ['g', 'r', 'y', 'b', 'm']
explode = [0, 0.2, 0, 0, 0.2]
# NOTE(review): 'TSLA  ' carries two trailing spaces — confirm whether that is intentional
labels = ['AAPL', 'GOOG', 'T', 'TSLA  ', 'AMZN']
plt.pie(values, colors = colors, labels = labels, explode = explode)
plt.title('STOCK PORTFOLIO')
plt.show()
HISTOGRAMS
# A histogram represents data using bars of various heights.
# Each bar groups numbers into specific ranges.
# Taller bars show that more data falls within that specific range.


mu = daily_return_df['AAPL'].mean() # mean of distribution
sigma = daily_return_df['AAPL'].std() # standard deviation of distribution

num_bins = 40

# Plot the histogram of the returns
plt.figure(figsize = (7, 5))
plt.hist(daily_return_df['AAPL'], num_bins, facecolor = 'blue');
plt.grid()
# plt.hist plots the number of samples per bin unless density=True is passed,
# so the y-axis shows counts rather than probabilities
plt.ylabel('Frequency')
plt.title('Histogram: mu=' + str(mu) + ', sigma=' +str(sigma));
MULTIPLE PLOTS
# Plot two columns (AAPL and sp500) against Date in a single call
stock_df.plot(x = 'Date', y = ['AAPL', 'sp500'], linewidth = 3)
plt.ylabel('Price')
plt.title('Stock Prices')
plt.grid()
SUBPLOTS
# First figure: two plots side by side (1 row, 2 columns)
plt.figure(figsize = (8, 5))

# subplot(rows, cols, index) selects the axes that the following calls draw on
plt.subplot(1, 2, 1)
plt.plot(stock_df['AAPL'], 'r--');  # 'r--' = red dashed line
plt.grid()

plt.subplot(1, 2, 2)
plt.plot(stock_df['sp500'], 'b.');  # 'b.' = blue point markers
plt.grid()
# Second figure: the same two plots stacked vertically (2 rows, 1 column)
plt.figure(figsize = (8, 8))

plt.subplot(2, 1, 1)
plt.plot(stock_df['AAPL'], 'r--');
plt.grid()

plt.subplot(2, 1, 2)
plt.plot(stock_df['sp500'], 'b.');
plt.grid()
3D PLOTS
# Toolkits are collections of application-specific functions that extend Matplotlib.
# mpl_toolkits.mplot3d provides tools for basic 3D plotting.
# https://matplotlib.org/mpl_toolkits/index.html

# Axes3D is not referenced by name below.
# NOTE(review): older Matplotlib releases needed this import to register the '3d' projection —
# confirm whether it is still required for the version in use
from mpl_toolkits.mplot3d import Axes3D

fig = plt.figure(figsize=(6, 6))
# projection='3d' asks for a three-dimensional axes on a 1x1 grid
ax = fig.add_subplot(111, projection='3d')

# Ten sample points given as parallel coordinate lists
x =[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
y =[5, 6, 2, 3, 13, 4, 1, 2, 4, 8]
z =[2, 3, 3, 3, 5, 7, 9, 11, 9, 10]

# Red circular markers
ax.scatter(x, y, z, c = 'r', marker = 'o')

ax.set_xlabel('X Label')
ax.set_ylabel('Y Label')
ax.set_zlabel('Z Label')
BOXPLOTS
# numpy.random.normal() takes three arguments: the mean, the standard deviation
# of the normal distribution, and the number of values desired.
# Great resource: https://stackoverflow.com/questions/17725927/boxplots-in-matplotlib-markers-and-outliers

# Fix the seed so the same random samples are drawn on every run
np.random.seed(20)

# Four samples of 2000 values each, with different means and spreads
data_1 = np.random.normal(200, 20, 2000)
data_2 = np.random.normal(60, 30, 2000)
data_3 = np.random.normal(70, 20, 2000)
data_4 = np.random.normal(40, 5, 2000)

data_all = [data_1, data_2, data_3, data_4]

fig = plt.figure(figsize = (10, 7))
ax = fig.add_subplot(111)
# One box per entry of data_all
bp = ax.boxplot(data_all)

Leave a Reply

Your email address will not be published. Required fields are marked *