Drawer

Data Visualization using Python Matplotlib Library

Data Visualization using Python Matplotlib Library - student project

Analysis and visualization of web traffic data using the Python library Matplotlib

 

Here's the code to analyse the given data and generate visualizations of it:

## Downloading the data
from urllib.request import urlretrieve

# The URL addresses one specific revision hash of the dataset; the file is
# saved into the current working directory on every run.
VIEWS_URL = "https://bitbucket.org/alvinwan/skillshare-data-101/raw/9b7ed1fd0b4b73fdd3f5a4a6cf67a532efb79635/views.pkl"
VIEWS_FILE = "views.pkl"
urlretrieve(VIEWS_URL, filename=VIEWS_FILE)

## Loading the data
import pandas as pd

# NOTE(review): unpickling can execute arbitrary code, and this file comes
# straight from the network. Only run this against a source you trust.
df = pd.read_pickle(VIEWS_FILE)
## Initial Data Exploration
# Bare expression: displays the frame as cell output in a notebook and is a
# no-op when the file is run as a plain script.
df

## Data Cleaning for Sanity Checks
# Clamp the watch time into [0, 60] and keep the result in a separate column
# so the raw values stay available. (Presumably seconds, going by the column
# name -- confirm against the data source.)
df["video_watched_s_trunc"] = df["video_watched_s"].clip(0, 60)
## Data after cleaning
df

## Data Analysis
# Store the click flag as an integer so it is treated as a numeric column;
# its mean is then the overall click-through rate.
df["has_clicked"] = df["has_clicked"].astype(int)
# Mean of every numeric column. 'number' already covers floats, and string
# columns have no meaningful mean, so only numeric dtypes are selected (the
# same selector the correlation step uses).
df.select_dtypes(include="number").mean()

## Defining the function to calculate events per day
def events_per_day(df):
    """Count how many rows of *df* fall on each calendar day.

    Args:
        df: pandas object whose index is datetime-like. Only the index is
            read; the columns are ignored.

    Returns:
        A Series indexed by day (each timestamp floored to midnight) whose
        values are the number of rows on that day, sorted by date. Days on
        which nothing happened are absent rather than reported as 0.
    """
    # Uppercase "D" is the day alias the daily-stats grouper in this file
    # already uses; keep the two spellings consistent.
    days = df.index.floor("D")
    # value_counts() orders by frequency, so put the result back in date order.
    return days.value_counts().sort_index()

# Daily totals over every row of the dataset (one row per page view).
views_per_day = events_per_day(df)
print("Total Views Per Day is:\n", views_per_day)

## Defining the function to get the click events
def get_click_events(df):
    """Return only the rows of *df* that recorded a click.

    Args:
        df: DataFrame with a ``has_clicked`` column (boolean, or 0/1 after
            the integer cast done earlier in this script).

    Returns:
        The rows of *df* whose ``has_clicked`` value compares equal to
        ``True``, with index and columns unchanged.

    Raises:
        KeyError: if *df* has no ``has_clicked`` column.
    """
    # Deliberately an elementwise ``==``: this builds a boolean mask over the
    # column (1 == True as well), which ``is True`` could not do.
    clicked = df["has_clicked"] == True  # noqa: E712
    return df[clicked]

clicks = get_click_events(df)
## Clicks per day
clicks_per_day = events_per_day(clicks)

print("Total Clicks Per Day is:\n" , clicks_per_day)
# Sanity check: on each day there should be fewer clicks than views.
# events_per_day() leaves out days that have no events at all, so line the
# click counts up with the view dates first (0 clicks for a click-free day).
# Comparing the raw arrays by position would pair up different dates, or fail
# outright when the two series differ in length.
clicks_aligned = clicks_per_day.reindex(views_per_day.index, fill_value=0)
print("Statement is:\n", clicks_aligned.values < views_per_day.values)

## Correlation
# Pairwise correlation between the numeric columns (notebook cell output).
numeric_columns = df.select_dtypes(include="number")
numeric_columns.corr()

## Splitting the two webpages data
# One frame per page variant, selected with a boolean mask on "webpage".
shown_page_A = df["webpage"] == "A"
shown_page_B = df["webpage"] == "B"
viewsA = df[shown_page_A]
viewsB = df[shown_page_B]
# Mean of the click flag per variant, shown as a pair (notebook cell output).
(viewsA["has_clicked"].mean(), viewsB["has_clicked"].mean())

## Defining the function to get daily stats
def get_daily_stats(df):
    """Average every column of *df* per calendar day.

    Args:
        df: DataFrame with a datetime-like index. Every column must support
            ``mean``; the callers in this script pass numeric columns only.

    Returns:
        DataFrame with one row per daily bin of the index, holding the mean
        of each column over the rows that fall into that bin.
    """
    # Grouper(freq="D") bins the rows by their index into one-day buckets.
    by_day = df.groupby(pd.Grouper(freq="D"))
    return by_day.mean()

# get_daily_stats() averages every column it is given, so narrow each page's
# frame down to its numeric columns before handing it over.
numeric_viewsA = viewsA.select_dtypes(include="number")
numeric_viewsB = viewsB.select_dtypes(include="number")
daily_viewsA = get_daily_stats(numeric_viewsA)
daily_viewsB = get_daily_stats(numeric_viewsB)
print("Daily Views A:\n", daily_viewsA, "\n" "Daily Views B:\n", daily_viewsB)

## Plotting the data
import matplotlib.pyplot as plt

plt.title("Videos are Strongly Correlated with Clicking")
# NOTE(review): the bar heights are hard-coded. Presumably they were copied
# from the has_clicked row of the correlation table computed earlier --
# confirm, and re-read them from the table if the data ever changes.
plt.bar(["Page Load", "Video Watching", "Reading Pricing"], [-0.39, 0.67, 0.004])
# Axis label spelling fixed (was "Correlationg").
plt.ylabel("Correlation with clicking")

## Plot No. 2
# New 10-inch-wide figure with the daily mean of has_clicked for each page.
ctr_figure = plt.figure()
ctr_figure.set_figwidth(10)
plt.title("Click Through Rates Over Time")
for page_label, line_color, daily_stats in (
    ("A", "red", daily_viewsA),
    ("B", "green", daily_viewsB),
):
    plt.plot(daily_stats["has_clicked"], label=page_label, color=line_color)
plt.legend()
plt.xlabel("Date")
plt.ylabel("Click Through Rate")

## Plot No. 3
# New 10-inch-wide figure with the daily mean page-load time of each page.
plt.figure().set_figwidth(10)
plt.title("Webpage Page Load Times")
# B is drawn before A here, which also fixes the order of the legend entries.
plt.plot(daily_viewsB["page_load_ms"], label = "B", color = "green")
plt.plot(daily_viewsA["page_load_ms"], label = "A", color = "red")
plt.legend()
plt.xlabel("Date")
# Axis label capitalisation fixed (was "TIme").
plt.ylabel("Page Load Time (ms)")

## Page Load Time Analysis
# Bucket the page-load times into 20 ms wide bins, labelled by the lower edge.
# viewsA is a filtered selection of df, and assigning a new column into such
# a selection in place triggers pandas' SettingWithCopyWarning; assign()
# returns a fresh frame with the extra column instead.
viewsA = viewsA.assign(page_load_ds=viewsA["page_load_ms"] // 20 * 20)
# Mean of every numeric column per bin. The bin column becomes the index of
# the result, so it is not averaged itself.
page_load = viewsA.select_dtypes(include="number").groupby("page_load_ds").mean()
# groupby() already sorts by key by default; kept to make the order explicit.
page_load = page_load.sort_index()
print("Title: Page Load\n", page_load)

## Plot No. 4
# Open a fresh figure first: without it, running this file as a script would
# draw this line onto whichever figure is current (the previous plot).
plt.figure()
plt.plot(page_load["has_clicked"])

## Numpy and Linear Regression
import numpy as np

# Degree-1 least-squares fit: has_clicked ~= m * page_load_ds + b.
m, b = np.polyfit(page_load.index, page_load["has_clicked"], 1)
# Slope rescaled to "change per 100 ms" (notebook cell output).
m * 100

## Plot No. 5
# Open a fresh figure so the measured and fitted lines get axes of their own
# instead of landing on whichever figure is current when run as a script.
plt.figure()
# NOTE(review): the "7%" in the title is hard-coded; presumably it was read
# off the fitted slope (m * 100) -- confirm it still matches the data.
plt.title("Every 100ms of Page Load Time Costs 7% CTR")
plt.plot(page_load["has_clicked"], label = "Click Through Rate", color = "red")
# Fitted line evaluated at the same bin positions as the measured curve.
plt.plot(page_load.index, m * page_load.index + b, label = "Fitted CTR", color = "blue")
plt.xlabel("Page Load Time (ms)")
plt.ylabel("Click Through Rate")
plt.legend()

## Average Click Through Rates for Page Load < 550 ms
# Boolean masks for the views whose page_load_ms is below 550.
fast_loads_A = viewsA["page_load_ms"] < 550
fast_loads_B = viewsB["page_load_ms"] < 550
# Mean of the click flag over just those views, per page.
meanctrA = viewsA.loc[fast_loads_A, "has_clicked"].mean()
meanctrB = viewsB.loc[fast_loads_B, "has_clicked"].mean()

print(f"Average click-through rate for viewsA with page load < 550 ms: {meanctrA}")
print(f"Average click-through rate for viewsB with page load < 550 ms: {meanctrB}")

## Daily Clicks
# Keep only the views that ended in a click, then count them per day,
# separately for each page.
clicksA, clicksB = get_click_events(viewsA), get_click_events(viewsB)
clicksAdaily, clicksBdaily = events_per_day(clicksA), events_per_day(clicksB)
print("Daily Click on Website A:", clicksAdaily, "Daily Click on Website B:", clicksBdaily)

## Plot No. 6
# New 9-inch-wide figure: actual daily clicks per page plus a predicted line
# for page A.
plt.figure().set_figwidth(9)
plt.title("Webpage A could have Boosted their CTR by 30%")
plt.plot(clicksAdaily, label = "A", color = "red")
plt.plot(clicksBdaily, label = "B", color = "green")
# NOTE(review): magic factors. Presumably 0.5 assumes page A receives half of
# all views and 0.7 is the click-through rate assumed for the prediction --
# confirm both against the analysis above.
plt.plot(views_per_day * 0.7 * 0.5, label = "A (predicted)", color = "blue", linestyle = "-.")
plt.xlabel("Date")
plt.ylabel("Clicks")
plt.legend()

# Nothing else in this file renders the figures; without this call a plain
# script run exits without showing any plot (harmless in a notebook).
plt.show()