# Libraries import for data analysis
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from sklearn.cluster import KMeans


# Data import
# Load the Amazon dataset from its CSV file on the local disk, then preview
# the first few rows.
DATA_PATH = 'C:/Users/Baljot/Desktop/Old School/Data 301/A3/Amazon_data.csv'
data = pd.read_csv(DATA_PATH)
data.head()


# List the column labels of the loaded frame.
data.columns

Index(['Unnamed: 0', 'base_spell', 'subspell', 'date_created_day', 'Top',
       'Left', 'asin', 'is_targeted_brand', 'search_result_amazonprime',
       'search_result_usedoptions', 'search_result_outofstock',
       'search_result_best_seller', 'search_result_sponsored',
       'search_result_sponsored_tag', 'search_result_rank',
       'search_result_resultdetail', 'search_result_stars',
       'search_result_ratings', 'search_result_newprice',
       'search_result_oldprice', 'search_result_unitprice',
       'search_result_deliveryrule', 'search_result_deliverytime',
       'search_result_price', 'search_result_stockleft',
       'search_result_coupon', 'search_result_freedelivery',
       'search_result_used_price', 'search_result_used_offers',
       'search_result_discount_subsave', 'search_result_freeshipping',
       'search_result_brand_subtitle', 'rank_full', 'rank_data_index',
       'rank_unique_page', 'amazon_brand', 'search_results_stars', 'ratings',
       'norating', 'stars', 'nostars', 'price', 'price_discount', 'noprice',
       'delivery_speed', 'min_for_freedelivery', 'delivery_fee',
       'nodeliveryfee', 'free_delivery', 'free_delivery_possible',
       'delivery_date', 'delivery_days', 'nodeliverydt', 'noinfostockleft',
       'coupon', 'n_other_offers', 'no_n_other_offers', 'new_offers',
       'used_offers', 'subsave_option', 'name_fit', 'name_cosine_distance',
       'nonamefit', 'nonamedistance', 'brand', 'major_brand',
       'has_amazon_brands', 'has_other_brands', 'amazon_brand_count',
       'major_brand_count', 'total_brand_count'],
      dtype='object')


# Percentage of rows whose `is_targeted_brand` flag is True.
# The explicit `== True` comparison is kept on purpose: any value that is not
# True (including a missing one) compares as False and counts as "not targeted".
is_targeted = data['is_targeted_brand'] == True
targeted_pct = 100 * is_targeted.mean()
print(f"Amazon-branded products make up {targeted_pct:.2f}% of all results.")

Amazon-branded products make up 1.26% of all results.


# Build the two groups being compared:
#   * rows flagged as the targeted brand
#   * rows that are NOT targeted but ARE flagged as a major brand
# NOTE(review): the second group's legend text reads 'Not Targeted Brand',
# although the filter also requires `major_brand` to be True.
targeted_mask = data['is_targeted_brand'] == True
major_untargeted_mask = (data['is_targeted_brand'] == False) & (data['major_brand'] == True)
targeted_brand = data[targeted_mask]
not_targeted_brand = data[major_untargeted_mask]

# One scatter layer per group, drawn in the same order as listed here so the
# legend order stays stable; points are semi-transparent to show overlap.
plt.figure(figsize=(10, 6))
for frame, colour, legend_text in (
    (targeted_brand, '#00A8E1', 'Amazon Targeted Brand'),
    (not_targeted_brand, 'red', 'Not Targeted Brand'),
):
    plt.scatter(frame['Left'], frame['Top'], marker='o', color=colour, label=legend_text, alpha=0.5)

# Axis text, legend and grid.
plt.title('Approximate positioning of Brands')
plt.xlabel('Left (X-coordinate)')
plt.ylabel('Top (Y-coordinate)')
plt.grid(True)
plt.legend()
plt.show()


# Convert the `date_created_day` column to pandas datetimes and store it back
# on the frame so later cells can group by real dates.
parsed_days = pd.to_datetime(data['date_created_day'])
data['date_created_day'] = parsed_days


# Report the earliest and latest dates covered by the dataset.
min_date, max_date = parsed_days.min(), parsed_days.max()
print(min_date, max_date)

2022-06-17 00:00:00 2023-01-09 00:00:00


# Indicator columns: the brand flags cast to integers, plus a constant 1 per
# row so that summing it yields the number of rows (products) per day.
data['amazon_brand_count'] = data['amazon_brand'].astype(int)
data['major_brand_count'] = data['major_brand'].astype(int)
data['total_brand_count'] = 1

# Per-day totals; both summaries share the same grouping by date.
daily = data.groupby('date_created_day')
grouped = daily[['amazon_brand_count', 'total_brand_count']].sum()
major_grouped = daily[['major_brand_count', 'total_brand_count']].sum()

# Daily share of rows flagged as Amazon brand / major brand.
grouped['amazon_brand_ratio'] = grouped['amazon_brand_count'].div(grouped['total_brand_count'])
major_grouped['major_brand_ratio'] = major_grouped['major_brand_count'].div(major_grouped['total_brand_count'])

# Line chart of both daily ratios against the date index.
plt.figure(figsize=(12, 6))
plt.plot(
    grouped.index,
    grouped['amazon_brand_ratio'],
    color='#00A8E1',
    marker='o',
    label='Amazon Brand Ratio',
)
plt.plot(
    major_grouped.index,
    major_grouped['major_brand_ratio'],
    color='red',
    marker='o',
    label='Major Brand Ratio',
)
plt.title('Amazon vs Major Brand Ratios Over Time')
plt.xlabel('Date')
plt.ylabel('Brand Ratio')
plt.grid(True)
plt.legend()
plt.tight_layout()
plt.show()


# Daily mean of `rank_full`, computed separately for rows flagged as an Amazon
# brand and for rows flagged as a major brand.
amazon_average_ranks = data[data['amazon_brand']].groupby('date_created_day')['rank_full'].mean()
major_average_ranks = data[data['major_brand']].groupby('date_created_day')['rank_full'].mean()

# Plot both daily series. Each line carries its own legend label so the solid
# lines can be told apart, matching the labelled series in the prevalence plot.
plt.figure(figsize=(12, 6))
plt.plot(amazon_average_ranks.index, amazon_average_ranks.values, color='#00A8E1', marker='o', label='Amazon Brands')
plt.plot(major_average_ranks.index, major_average_ranks.values, color='red', marker='o', label='Major Brands')

# Dashed reference lines at the mean of the daily means. These are average
# ranks, so the legend text says "Rank" (it previously said "Prevalence",
# carried over from the prevalence plot).
plt.axhline(amazon_average_ranks.mean(), color='#00A8E1', linestyle='--', label=f'Avg Amazon Rank: {amazon_average_ranks.mean():.2f}')
plt.axhline(major_average_ranks.mean(), color='red', linestyle='--', label=f'Avg Major Rank: {major_average_ranks.mean():.2f}')
plt.xlabel('Date')
plt.ylabel('Brand Rank Average')
plt.title('Brand Rank Average Over Time')
plt.grid(True)
plt.legend()
plt.show()


# Daily mean of `rank_data_index`, computed separately for rows flagged as an
# Amazon brand and for rows flagged as a major brand.
# NOTE: this rebinds `amazon_average_ranks` / `major_average_ranks`, replacing
# the `rank_full` series computed earlier under the same names.
amazon_average_ranks = data[data['amazon_brand']].groupby('date_created_day')['rank_data_index'].mean()
major_average_ranks = data[data['major_brand']].groupby('date_created_day')['rank_data_index'].mean()

# Plot both daily series. Each line carries its own legend label so the solid
# lines can be told apart, matching the labelled series in the prevalence plot.
plt.figure(figsize=(12, 6))
plt.plot(amazon_average_ranks.index, amazon_average_ranks.values, color='#00A8E1', marker='o', label='Amazon Brands')
plt.plot(major_average_ranks.index, major_average_ranks.values, color='red', marker='o', label='Major Brands')

# Dashed reference lines at the mean of the daily means. These are average
# result positions, so the legend text says "Position" (it previously said
# "Prevalence", carried over from the prevalence plot).
plt.axhline(amazon_average_ranks.mean(), color='#00A8E1', linestyle='--', label=f'Avg Amazon Position: {amazon_average_ranks.mean():.2f}')
plt.axhline(major_average_ranks.mean(), color='red', linestyle='--', label=f'Avg Major Position: {major_average_ranks.mean():.2f}')
plt.xlabel('Date')
plt.ylabel('Brand Result Position Average')
plt.title('Brand Result Position Average Over Time')
plt.grid(True)
plt.legend()
plt.show()


# Keep only the rows flagged as Amazon brand / major brand.
amazon_brands_df = data.loc[data['amazon_brand']]
major_brands_df = data.loc[data['major_brand']]

# Number of rows per day in each subset.
amazon_brand_counts = amazon_brands_df.groupby('date_created_day').size()
major_brand_counts = major_brands_df.groupby('date_created_day').size()

# (counts, colour, series label, label prefix for the average line)
prevalence_series = (
    (amazon_brand_counts, '#00A8E1', 'Amazon Brands', 'Avg Amazon Prevalence'),
    (major_brand_counts, 'red', 'Major Brands', 'Avg Major Prevalence'),
)

plt.figure(figsize=(12, 6))

# Daily counts first, then the dashed average lines, so the legend lists the
# two series before the two averages.
for counts, colour, series_label, _ in prevalence_series:
    plt.plot(counts.index, counts.values, label=series_label, color=colour, marker='o')
for counts, colour, _, average_prefix in prevalence_series:
    plt.axhline(counts.mean(), color=colour, linestyle='--', label=f'{average_prefix}: {counts.mean():.2f}')

plt.title('Brand Prevalence Over Time')
plt.xlabel('Date')
plt.ylabel('Brand Prevalence')
plt.legend()
plt.grid(True)
plt.show()

# Amazon Product Placement and Brand Bias Analysis

# Load and Preview Data

# Feature Engineering

# Exploratory Data Analysis (EDA)

# Spatial Placement Distribution

# Date Parsing

# Dataset Time Range

# Amazon vs Major Brand Presence Over Time

# Average Search Rank of Amazon vs Major Brands

# Average Search Result Position of Amazon vs Major Brands

# Brand Prevalence Over Time

# Summary

	Unnamed: 0	base_spell	subspell	date_created_day	Top	Left	asin	is_targeted_brand	search_result_amazonprime	search_result_usedoptions	...	used_offers	subsave_option	name_fit	name_cosine_distance	nonamefit	nonamedistance	brand	major_brand
0	1	1	1	8/31/2022	226.00000	283.20001	B098QR8Q4N	False	False	False	...	False	False	0.400000	0.186015	False	NaN	Warriors:	False
1	2	1	1	8/31/2022	461.05002	283.20001	B000VYX8L8	False	False	False	...	False	False	0.293103	0.119305	False	NaN	Warriors	False
2	3	1	1	8/31/2022	722.65002	283.20001	B0B6Y7SNT9	False	False	False	...	False	False	0.333333	0.303127	False	NaN	Warriors:	False
3	4	1	1	8/31/2022	1341.45010	283.20001	B09RKBVZVJ	False	False	False	...	False	False	0.363636	0.310618	False	NaN	Warriors:	False
4	5	1	1	8/31/2022	1571.65000	283.20001	B09N8T1DWR	False	False	False	...	False	False	0.363636	0.170995	False	NaN	Warriors	False