In [2]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
# Smoke test: confirms this cell (and the imports above it) ran without error.
print("hi")
hi
In [3]:
# Load the Amazon dataset into `data`, the DataFrame that every later cell reads from.
# The file location can be overridden with the AMAZON_DATA_CSV environment variable so
# the notebook can run on another machine; when the variable is unset the original
# absolute path is used, so existing behaviour is unchanged.
AMAZON_DATA_CSV = os.environ.get(
    'AMAZON_DATA_CSV',
    'C:/Users/Baljot/Desktop/Data 301/A3/Amazon_data.csv',
)
data = pd.read_csv(AMAZON_DATA_CSV)
# Preview the first rows as the cell output.
data.head()
Out[3]:
Unnamed: 0 | base_spell | subspell | date_created_day | Top | Left | asin | is_targeted_brand | search_result_amazonprime | search_result_usedoptions | ... | used_offers | subsave_option | name_fit | name_cosine_distance | nonamefit | nonamedistance | brand | major_brand | has_amazon_brands | has_other_brands | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 1 | 1 | 8/31/2022 | 226.00000 | 283.20001 | B098QR8Q4N | False | False | False | ... | False | False | 0.400000 | 0.186015 | False | NaN | Warriors: | False | 0 | 0 |
1 | 2 | 1 | 1 | 8/31/2022 | 461.05002 | 283.20001 | B000VYX8L8 | False | False | False | ... | False | False | 0.293103 | 0.119305 | False | NaN | Warriors | False | 0 | 0 |
2 | 3 | 1 | 1 | 8/31/2022 | 722.65002 | 283.20001 | B0B6Y7SNT9 | False | False | False | ... | False | False | 0.333333 | 0.303127 | False | NaN | Warriors: | False | 0 | 0 |
3 | 4 | 1 | 1 | 8/31/2022 | 1341.45010 | 283.20001 | B09RKBVZVJ | False | False | False | ... | False | False | 0.363636 | 0.310618 | False | NaN | Warriors: | False | 0 | 0 |
4 | 5 | 1 | 1 | 8/31/2022 | 1571.65000 | 283.20001 | B09N8T1DWR | False | False | False | ... | False | False | 0.363636 | 0.170995 | False | NaN | Warriors | False | 0 | 0 |
5 rows × 68 columns
In [4]:
# Inspect the targeted-brand flag. The column is selected by name rather than by
# position 7 so the cell keeps showing the same column if columns are added or reordered.
data['is_targeted_brand']
Out[4]:
0 False 1 False 2 False 3 False 4 False ... 228276 False 228277 False 228278 False 228279 False 228280 False Name: is_targeted_brand, Length: 228281, dtype: bool
In [5]:
# Percentage of rows whose is_targeted_brand flag is True.
data['is_targeted_brand'].eq(True).mean() * 100
Out[5]:
1.2616030243428056
In [53]:
# Largest value in the 'Top' column.
data['Top'].max()
Out[53]:
18943.771
In [72]:
# Largest value in the 'Left' column.
data['Left'].max()
Out[72]:
2919.0156
In [26]:
# Scatter the Left/Top coordinates of two subsets of rows to compare where they appear.
# Subset 1: rows flagged as a targeted brand.
targeted_brand = data[data['is_targeted_brand'] == True]
# Subset 2: rows that are NOT targeted AND are a major brand. The legend label below
# names both conditions so the plot does not claim to show every non-targeted row.
not_targeted_brand = data[(data['is_targeted_brand'] == False) & (data['major_brand'] == True)]

# One colour per subset; alpha=0.5 keeps overlapping points distinguishable.
plt.figure(figsize=(10, 6))
plt.scatter(targeted_brand['Left'], targeted_brand['Top'], marker='o', color='#00A8E1', label='Amazon Targeted Brand', alpha=0.5)
plt.scatter(not_targeted_brand['Left'], not_targeted_brand['Top'], marker='o', color='red', label='Major Brand (Not Targeted)', alpha=0.5)
# NOTE(review): if 'Top' is measured downward from the top of the page, the plot is
# vertically flipped relative to the page; consider plt.gca().invert_yaxis() -- confirm.
plt.xlabel('Left (X-coordinate)')
plt.ylabel('Top (Y-coordinate)')
plt.title('Approximate positioning of Brands')
plt.legend()
plt.grid(True)
plt.show()
In [8]:
# Percentage of rows whose major_brand flag is True.
data['major_brand'].eq(True).mean() * 100
Out[8]:
3.9162260547307923
In [9]:
# Fraction of all rows that are a major brand and are not a targeted brand.
is_major = data['major_brand'].eq(True)
is_untargeted = data['is_targeted_brand'].eq(False)
(is_major & is_untargeted).mean()
Out[9]:
0.039162260547307925
In [10]:
# Convert the day column to pandas datetimes so later cells can take min/max and group by date.
data['date_created_day'] = data['date_created_day'].pipe(pd.to_datetime)
In [11]:
# Report the first and last day present in the dataset.
observed_days = data['date_created_day']
min_date, max_date = observed_days.min(), observed_days.max()
print(min_date, max_date)
2022-06-17 00:00:00 2023-01-09 00:00:00
In [ ]:
In [12]:
# Daily share of rows that are Amazon-brand and that are major-brand.
# Uses the boolean columns 'amazon_brand' and 'major_brand' of `data`, grouped by
# 'date_created_day'.

# Helper columns: 0/1 indicators for each flag plus a constant 1, so that a per-day
# sum gives the number of flagged rows and the total number of rows respectively.
data['amazon_brand_count'] = data['amazon_brand'].astype(int)
data['major_brand_count'] = data['major_brand'].astype(int)
data['total_brand_count'] = 1

# Sum the indicators per day.
grouped = data.groupby('date_created_day').agg({'amazon_brand_count': 'sum', 'total_brand_count': 'sum'})
major_grouped = data.groupby('date_created_day').agg({'major_brand_count': 'sum', 'total_brand_count': 'sum'})

# Each ratio is taken against the total held in its own grouped frame.
grouped['amazon_brand_ratio'] = grouped['amazon_brand_count'] / grouped['total_brand_count']
major_grouped['major_brand_ratio'] = major_grouped['major_brand_count'] / major_grouped['total_brand_count']

# Plot both ratios over time; labels + legend make the two lines distinguishable.
plt.figure(figsize=(12, 6))
plt.plot(grouped.index, grouped['amazon_brand_ratio'], color='blue', marker='o', label='Amazon Brands')
plt.plot(major_grouped.index, major_grouped['major_brand_ratio'], color='red', marker='o', label='Major Brands')
plt.xlabel('Date')
plt.ylabel('Brand Ratio')
plt.title('Amazon and Major Brand Ratio Over Time')
plt.grid(True)
plt.legend()
plt.show()
In [22]:
# Mean 'rank_full' per day, computed separately for Amazon-brand rows and for
# major-brand rows of `data` (selected with the boolean 'amazon_brand' / 'major_brand'
# columns).
amazon_average_ranks = data[data['amazon_brand']].groupby('date_created_day')['rank_full'].mean()
major_average_ranks = data[data['major_brand']].groupby('date_created_day')['rank_full'].mean()

# Daily averages: Amazon in '#00A8E1', major brands in red.
plt.figure(figsize=(12, 6))
plt.plot(amazon_average_ranks.index, amazon_average_ranks.values, color='#00A8E1', marker='o', label='Amazon Brands')
plt.plot(major_average_ranks.index, major_average_ranks.values, color='red', marker='o', label='Major Brands')

# Dashed lines mark the mean of the daily averages. Each line uses the colour and the
# label of the series it summarises, and is labelled as a rank (not a prevalence).
plt.axhline(amazon_average_ranks.mean(), color='#00A8E1', linestyle='--', label=f'Avg Amazon Rank: {amazon_average_ranks.mean():.2f}')
plt.axhline(major_average_ranks.mean(), color='red', linestyle='--', label=f'Avg Major Rank: {major_average_ranks.mean():.2f}')
plt.xlabel('Date')
plt.ylabel('Brand Rank Average')
plt.title('Brand Rank Average Over Time (rank_full)')
plt.grid(True)
plt.legend()
plt.show()
In [24]:
# Mean 'rank_data_index' per day, computed separately for Amazon-brand rows and for
# major-brand rows of `data` (selected with the boolean 'amazon_brand' / 'major_brand'
# columns).
amazon_average_ranks = data[data['amazon_brand']].groupby('date_created_day')['rank_data_index'].mean()
major_average_ranks = data[data['major_brand']].groupby('date_created_day')['rank_data_index'].mean()

# Daily averages: Amazon in '#00A8E1', major brands in red.
plt.figure(figsize=(12, 6))
plt.plot(amazon_average_ranks.index, amazon_average_ranks.values, color='#00A8E1', marker='o', label='Amazon Brands')
plt.plot(major_average_ranks.index, major_average_ranks.values, color='red', marker='o', label='Major Brands')

# Dashed lines mark the mean of the daily averages. Each line uses the colour and the
# label of the series it summarises, so the legend matches the plotted data.
plt.axhline(amazon_average_ranks.mean(), color='#00A8E1', linestyle='--', label=f'Avg Amazon Rank: {amazon_average_ranks.mean():.2f}')
plt.axhline(major_average_ranks.mean(), color='red', linestyle='--', label=f'Avg Major Rank: {major_average_ranks.mean():.2f}')
plt.xlabel('Date')
plt.ylabel('Brand Rank Average')
plt.title('Brand Rank Average Over Time (rank_data_index)')
plt.grid(True)
plt.legend()
plt.show()
In [17]:
# Daily row counts for Amazon-brand rows and for major-brand rows.
amazon_brands_df = data.loc[data['amazon_brand']]
major_brands_df = data.loc[data['major_brand']]

# Number of rows per day in each subset.
amazon_brand_counts = amazon_brands_df.groupby('date_created_day').size()
major_brand_counts = major_brands_df.groupby('date_created_day').size()

# (daily counts, colour, line label, mean-line label) for each series, in plot order.
prevalence_series = (
    (amazon_brand_counts, 'blue', 'Amazon Brands', 'Avg Amazon Prevalence'),
    (major_brand_counts, 'red', 'Major Brands', 'Avg Major Prevalence'),
)

plt.figure(figsize=(12, 6))
# Both daily lines are drawn before both mean lines so the legend order stays the same.
for counts, colour, line_label, _ in prevalence_series:
    plt.plot(counts.index, counts.values, label=line_label, color=colour, marker='o')
for counts, colour, _, mean_label in prevalence_series:
    daily_mean = counts.mean()
    plt.axhline(daily_mean, color=colour, linestyle='--', label=f'{mean_label}: {daily_mean:.2f}')
plt.xlabel('Date')
plt.ylabel('Brand Prevalence')
plt.title('Brand Prevalence Over Time')
plt.grid(True)
plt.legend()
plt.show()
In [29]:
# Fraction of rows whose rank_data_index equals 1. The comparison is averaged directly;
# wrapping it in a list first would only build an extra 2-D array with the same mean.
(data['rank_data_index'] == 1).mean()
Out[29]:
0.018087357248303626
In [30]:
# List every column name in `data`. The trailing *_count columns are helper columns
# added by the brand-ratio cell, so they appear only once that cell has been run.
data.columns
Out[30]:
Index(['Unnamed: 0', 'base_spell', 'subspell', 'date_created_day', 'Top', 'Left', 'asin', 'is_targeted_brand', 'search_result_amazonprime', 'search_result_usedoptions', 'search_result_outofstock', 'search_result_best_seller', 'search_result_sponsored', 'search_result_sponsored_tag', 'search_result_rank', 'search_result_resultdetail', 'search_result_stars', 'search_result_ratings', 'search_result_newprice', 'search_result_oldprice', 'search_result_unitprice', 'search_result_deliveryrule', 'search_result_deliverytime', 'search_result_price', 'search_result_stockleft', 'search_result_coupon', 'search_result_freedelivery', 'search_result_used_price', 'search_result_used_offers', 'search_result_discount_subsave', 'search_result_freeshipping', 'search_result_brand_subtitle', 'rank_full', 'rank_data_index', 'rank_unique_page', 'amazon_brand', 'search_results_stars', 'ratings', 'norating', 'stars', 'nostars', 'price', 'price_discount', 'noprice', 'delivery_speed', 'min_for_freedelivery', 'delivery_fee', 'nodeliveryfee', 'free_delivery', 'free_delivery_possible', 'delivery_date', 'delivery_days', 'nodeliverydt', 'noinfostockleft', 'coupon', 'n_other_offers', 'no_n_other_offers', 'new_offers', 'used_offers', 'subsave_option', 'name_fit', 'name_cosine_distance', 'nonamefit', 'nonamedistance', 'brand', 'major_brand', 'has_amazon_brands', 'has_other_brands', 'amazon_brand_count', 'major_brand_count', 'total_brand_count'], dtype='object')
In [40]:
# Number of rows flagged as a targeted brand.
data.loc[data['is_targeted_brand'].eq(True)].shape[0]
Out[40]:
2880
In [39]:
# Number of targeted-brand rows that also have search_result_amazonprime set.
is_targeted = data['is_targeted_brand'].eq(True)
has_prime = data['search_result_amazonprime'].eq(True)
data.loc[is_targeted & has_prime].shape[0]
Out[39]:
1997
In [38]:
# Share of targeted-brand rows that have search_result_amazonprime set.
# Computed from `data` rather than from hand-typed counts (1997 / 2880), so the value
# cannot drift out of sync with the dataset or with the filters used to count.
is_targeted = data['is_targeted_brand'] == True
has_prime = data['search_result_amazonprime'] == True
len(data[is_targeted & has_prime]) / len(data[is_targeted])
Out[38]:
0.6934027777777778
In [49]:
# Number of rows flagged as a major brand.
data.loc[data['major_brand'].eq(True)].shape[0]
Out[49]:
8940
In [41]:
# Number of rows that are NOT a targeted brand. This is the denominator (225401 =
# 228281 total rows - 2880 targeted rows) used by the non-targeted Prime and sponsored
# share calculations further down; filtering on major_brand does not give that figure.
len(data[data['is_targeted_brand']==False])
Out[41]:
225401
In [42]:
# Number of non-targeted rows that have search_result_amazonprime set.
is_untargeted = data['is_targeted_brand'].eq(False)
has_prime = data['search_result_amazonprime'].eq(True)
data.loc[is_untargeted & has_prime].shape[0]
Out[42]:
139161
In [43]:
# Share of non-targeted rows that have search_result_amazonprime set.
# Computed from `data` rather than from hand-typed counts (139161 / 225401), so the
# value cannot drift out of sync with the dataset or with the filters used to count.
is_untargeted = data['is_targeted_brand'] == False
has_prime = data['search_result_amazonprime'] == True
len(data[is_untargeted & has_prime]) / len(data[is_untargeted])
Out[43]:
0.6173930018056708
In [44]:
# Number of targeted-brand rows that also have search_result_sponsored set.
is_targeted = data['is_targeted_brand'].eq(True)
is_sponsored = data['search_result_sponsored'].eq(True)
data.loc[is_targeted & is_sponsored].shape[0]
Out[44]:
718
In [46]:
# Share of targeted-brand rows that have search_result_sponsored set.
# Computed from `data` rather than from hand-typed counts (718 / 2880), so the value
# cannot drift out of sync with the dataset or with the filters used to count.
is_targeted = data['is_targeted_brand'] == True
is_sponsored = data['search_result_sponsored'] == True
len(data[is_targeted & is_sponsored]) / len(data[is_targeted])
Out[46]:
0.24930555555555556
In [45]:
# Number of non-targeted rows that have search_result_sponsored set.
is_untargeted = data['is_targeted_brand'].eq(False)
is_sponsored = data['search_result_sponsored'].eq(True)
data.loc[is_untargeted & is_sponsored].shape[0]
Out[45]:
51043
In [47]:
# Share of non-targeted rows that have search_result_sponsored set.
# Computed from `data` rather than from hand-typed counts (51043 / 225401), so the
# value cannot drift out of sync with the dataset or with the filters used to count.
is_untargeted = data['is_targeted_brand'] == False
is_sponsored = data['search_result_sponsored'] == True
len(data[is_untargeted & is_sponsored]) / len(data[is_untargeted])
Out[47]:
0.226454186095004
In [ ]:
# Number of targeted-brand rows whose search_result_deliveryrule is exactly 'Same'.
# NOTE(review): this cell has no recorded output; confirm 'Same' is a value that
# actually occurs in search_result_deliveryrule.
is_targeted = data['is_targeted_brand'].eq(True)
is_same_rule = data['search_result_deliveryrule'].eq('Same')
data.loc[is_targeted & is_same_rule].shape[0]
In [ ]:
In [ ]:
In [ ]: