Mohammad A. Kazemivarnamkhasti, Rami Shehadeh, Wenxuan Wan
It's hard to imagine modern society operating without YouTube, the video-sharing goliath. YouTube is the second most popular website on Earth with 1.8 billion user visits every month, falling short of only Google, its parent site. With the massive amount of traffic YouTube receives, it's hard for a single video to get any attention. However, YouTube superstars like MrBeast, T-Series, and PewDiePie seem to soar into the trending videos effortlessly. This begs the question: what makes a video go viral? What kinds of videos have the highest chance of reaching the most people? What kinds of videos stir up the most activity in the comments? Let's first take a step back. Why do we even want to succeed on YouTube? Many content creators pour hundreds of hours into a single video in the hope that people will watch it and stay engaged. Creative drive aside, a successful YouTube channel means a stream of ad revenue and sponsorships, and successful YouTubers can even appear on national television or other traditional media as guests. However, all of this is contingent on your videos doing well. In this analysis we seek to answer: what traits do successful videos have in common? What is their audience engagement like, and what topics do they cover?
For this analysis we collected our YouTube trending dataset from Kaggle, more specifically this link, which takes you to a dataset containing trending YouTube data from 2020 to 2022. The data comprises multiple .csv files; since we're mainly interested in the US data, we'll be using US_youtube_trending_data.csv. pandas and NumPy are used for basic data science and data manipulation.
import pandas as pd
import numpy as np
import math
from matplotlib import pyplot as plt
import seaborn as sns
from scipy import stats
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
Before any analysis can be done, we first must clean the data so that it's more workable and presentable.
Let's first see what our data looks like.
df = pd.read_csv("US_youtube_trending_data.csv")
df.head()
video_id | title | publishedAt | channelId | channelTitle | categoryId | trending_date | tags | view_count | likes | dislikes | comment_count | thumbnail_link | comments_disabled | ratings_disabled | description | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 3C66w5Z0ixs | I ASKED HER TO BE MY GIRLFRIEND... | 2020-08-11T19:20:14Z | UCvtRTOMP2TqYqu51xNrqAzg | Brawadis | 22 | 2020-08-12T00:00:00Z | brawadis|prank|basketball|skits|ghost|funny vi... | 1514614 | 156908 | 5855 | 35313 | https://i.ytimg.com/vi/3C66w5Z0ixs/default.jpg | False | False | SUBSCRIBE to BRAWADIS ▶ http://bit.ly/Subscrib... |
1 | M9Pmf9AB4Mo | Apex Legends | Stories from the Outlands – “Th... | 2020-08-11T17:00:10Z | UC0ZV6M2THA81QT9hrVWJG3A | Apex Legends | 20 | 2020-08-12T00:00:00Z | Apex Legends|Apex Legends characters|new Apex ... | 2381688 | 146739 | 2794 | 16549 | https://i.ytimg.com/vi/M9Pmf9AB4Mo/default.jpg | False | False | While running her own modding shop, Ramya Pare... |
2 | J78aPJ3VyNs | I left youtube for a month and THIS is what ha... | 2020-08-11T16:34:06Z | UCYzPXprvl5Y-Sf0g4vX-m6g | jacksepticeye | 24 | 2020-08-12T00:00:00Z | jacksepticeye|funny|funny meme|memes|jacksepti... | 2038853 | 353787 | 2628 | 40221 | https://i.ytimg.com/vi/J78aPJ3VyNs/default.jpg | False | False | I left youtube for a month and this is what ha... |
3 | kXLn3HkpjaA | XXL 2020 Freshman Class Revealed - Official An... | 2020-08-11T16:38:55Z | UCbg_UMjlHJg_19SZckaKajg | XXL | 10 | 2020-08-12T00:00:00Z | xxl freshman|xxl freshmen|2020 xxl freshman|20... | 496771 | 23251 | 1856 | 7647 | https://i.ytimg.com/vi/kXLn3HkpjaA/default.jpg | False | False | Subscribe to XXL → http://bit.ly/subscribe-xxl... |
4 | VIUo6yapDbc | Ultimate DIY Home Movie Theater for The LaBran... | 2020-08-11T15:10:05Z | UCDVPcEbVLQgLZX0Rt6jo34A | Mr. Kate | 26 | 2020-08-12T00:00:00Z | The LaBrant Family|DIY|Interior Design|Makeove... | 1123889 | 45802 | 964 | 2196 | https://i.ytimg.com/vi/VIUo6yapDbc/default.jpg | False | False | Transforming The LaBrant Family's empty white ... |
We see that there is a lot of information present, and some of it is encoded as numbers. Let's first translate the categoryId into English names. This is done by referring to the YouTube video category ID list found here: https://mixedanalytics.com/youtube-video-category-id-list/
cat_id = {
    1: "Film & Animation",
    2: "Autos & Vehicles",
    10: "Music",
    15: "Pets & Animals",
    17: "Sports",
    18: "Short Movies",
    19: "Travel & Events",
    20: "Gaming",
    21: "Videoblogging",
    22: "People & Blogs",
    23: "Comedy",
    24: "Entertainment",
    25: "News & Politics",
    26: "Howto & Style",
    27: "Education",
    28: "Science & Technology",
    29: "Nonprofits & Activism",
    30: "Movies",
    31: "Anime/Animation",
    32: "Action/Adventure",
    33: "Classics",
    34: "Comedy",
    35: "Documentary",
    36: "Drama",
    37: "Family",
    38: "Foreign",
    39: "Horror",
    40: "Sci-Fi/Fantasy",
    41: "Thriller",
    42: "Shorts",
    43: "Shows",
    44: "Trailers"
}
# Map each numeric category ID to its human-readable name
df['category'] = df['categoryId'].map(cat_id)
df.drop(columns=['categoryId'], inplace=True)
There's still a lot of information that is not important to us, such as the video ID, channel ID, thumbnail URL, and description. Let's remove those columns from the dataframe.
df.drop(columns = ['video_id', 'channelId', 'description','thumbnail_link'], inplace = True)
df
title | publishedAt | channelTitle | trending_date | tags | view_count | likes | dislikes | comment_count | comments_disabled | ratings_disabled | category | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | I ASKED HER TO BE MY GIRLFRIEND... | 2020-08-11T19:20:14Z | Brawadis | 2020-08-12T00:00:00Z | brawadis|prank|basketball|skits|ghost|funny vi... | 1514614 | 156908 | 5855 | 35313 | False | False | People & Blogs |
1 | Apex Legends | Stories from the Outlands – “Th... | 2020-08-11T17:00:10Z | Apex Legends | 2020-08-12T00:00:00Z | Apex Legends|Apex Legends characters|new Apex ... | 2381688 | 146739 | 2794 | 16549 | False | False | Gaming |
2 | I left youtube for a month and THIS is what ha... | 2020-08-11T16:34:06Z | jacksepticeye | 2020-08-12T00:00:00Z | jacksepticeye|funny|funny meme|memes|jacksepti... | 2038853 | 353787 | 2628 | 40221 | False | False | Entertainment |
3 | XXL 2020 Freshman Class Revealed - Official An... | 2020-08-11T16:38:55Z | XXL | 2020-08-12T00:00:00Z | xxl freshman|xxl freshmen|2020 xxl freshman|20... | 496771 | 23251 | 1856 | 7647 | False | False | Music |
4 | Ultimate DIY Home Movie Theater for The LaBran... | 2020-08-11T15:10:05Z | Mr. Kate | 2020-08-12T00:00:00Z | The LaBrant Family|DIY|Interior Design|Makeove... | 1123889 | 45802 | 964 | 2196 | False | False | Howto & Style |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
172185 | Minecraft but I have 1,000,000 XP | 2022-12-11T15:00:03Z | Craftee | 2022-12-16T00:00:00Z | minecraft|minecraft but|new minecraft|craftee|... | 1190457 | 23172 | 0 | 1497 | False | False | Gaming |
172186 | Paddy Pimblett Octagon Interview | UFC 282 | 2022-12-11T05:33:54Z | UFC - Ultimate Fighting Championship | 2022-12-16T00:00:00Z | ufc|mma|ultimate fighting championship|UFC 282... | 1446095 | 11069 | 0 | 7736 | False | False | Sports |
172187 | REVEALING Our BABY'S NAME! 😱 | 2022-12-10T17:46:12Z | The Royalty Family | 2022-12-16T00:00:00Z | The Royalty Family|Royalty Family|Andrea Espad... | 3103034 | 76518 | 0 | 18022 | False | False | People & Blogs |
172188 | Star Wars Jedi: Survivor - Official Reveal Tra... | 2022-12-09T01:52:37Z | EA Star Wars | 2022-12-16T00:00:00Z | star wars|star wars jedi survivor|jedi survivo... | 5286961 | 174278 | 0 | 12748 | False | False | Gaming |
172189 | Chris Brown - It's Giving Christmas | 2022-12-09T17:00:26Z | ChrisBrownVEVO | 2022-12-16T00:00:00Z | Chris brown its giving Christmas|chris brown t... | 889375 | 60867 | 0 | 2918 | False | False | Music |
172190 rows × 12 columns
Intuitively, we expect popular and trending videos to have high view counts and a high like ratio. Let's see how related they actually are. First we need to check whether there are any missing values in the columns we plan to look at.
nan_likes = df['likes'].isnull().values.any()
nan_dislikes = df['dislikes'].isnull().values.any()
nan_views = df['view_count'].isnull().values.any()
print(nan_likes)
print(nan_dislikes)
print(nan_views)
False
False
False
Perfect, there aren't any null or missing values. However, this dataset isn't going to be THAT nice to us: YouTube removed public dislike counts, so any video collected after November 10, 2021 shows a zero for its dislike value. We believe that looking at the remaining data (where dislikes were included) will still provide valuable insights, so rows with zero dislikes will be dropped.
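Before dropping anything, it's worth a quick count of how much of the data this affects (the exact numbers depend on when the dataset snapshot was taken):
zero_dislikes = (df['dislikes'] == 0).sum()
print(zero_dislikes, "of", len(df), "rows have zero dislikes and will be dropped")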
### Let's find out more about view counts and like-to-dislike ratios
# A video's like ratio is likes / (likes + dislikes), expressed as a percentage.
# Rows with zero dislikes (the post-November-2021 data) get NaN and are dropped.
df['likes_ratio'] = np.where(
    df['dislikes'] == 0,
    np.nan,
    df['likes'] / (df['likes'] + df['dislikes']) * 100,
)
df = df.dropna()
df.head()
title | publishedAt | channelTitle | trending_date | tags | view_count | likes | dislikes | comment_count | comments_disabled | ratings_disabled | category | likes_ratio | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | I ASKED HER TO BE MY GIRLFRIEND... | 2020-08-11T19:20:14Z | Brawadis | 2020-08-12T00:00:00Z | brawadis|prank|basketball|skits|ghost|funny vi... | 1514614 | 156908 | 5855 | 35313 | False | False | People & Blogs | 96.402745 |
1 | Apex Legends | Stories from the Outlands – “Th... | 2020-08-11T17:00:10Z | Apex Legends | 2020-08-12T00:00:00Z | Apex Legends|Apex Legends characters|new Apex ... | 2381688 | 146739 | 2794 | 16549 | False | False | Gaming | 98.131516 |
2 | I left youtube for a month and THIS is what ha... | 2020-08-11T16:34:06Z | jacksepticeye | 2020-08-12T00:00:00Z | jacksepticeye|funny|funny meme|memes|jacksepti... | 2038853 | 353787 | 2628 | 40221 | False | False | Entertainment | 99.262657 |
3 | XXL 2020 Freshman Class Revealed - Official An... | 2020-08-11T16:38:55Z | XXL | 2020-08-12T00:00:00Z | xxl freshman|xxl freshmen|2020 xxl freshman|20... | 496771 | 23251 | 1856 | 7647 | False | False | Music | 92.607639 |
4 | Ultimate DIY Home Movie Theater for The LaBran... | 2020-08-11T15:10:05Z | Mr. Kate | 2020-08-12T00:00:00Z | The LaBrant Family|DIY|Interior Design|Makeove... | 1123889 | 45802 | 964 | 2196 | False | False | Howto & Style | 97.938673 |
Let's verify that our code above worked.
count_zero = (df['dislikes'] == 0).sum()
count_ratio = (df['likes_ratio'] == 0).sum()
print(count_zero)
print(count_ratio)
0
0
Let's plot our like ratio (as a percentage) against the number of views a video has.
plt.scatter(df['likes_ratio'], df['view_count'])
plt.xlabel(r"Like Ratio (%)")
plt.ylabel("Views")
plt.title("Like Ratio vs Views")
plt.show()
This is a bit messy; let's try another approach, this time using seaborn and its built-in regression.
sns.regplot(x='view_count',y='likes',data=df)
plt.show()
With this we see a more linear relationship between views and likes. This is expected, but we can look into it further by taking the log (base 10) of each axis.
views = df['view_count']
likes = df['likes']
views_log = []
likes_log = []
# Skip rows where either count is zero, since log(0) is undefined
for v, l in zip(views, likes):
    if v == 0 or l == 0:
        continue
    views_log.append(math.log10(v))
    likes_log.append(math.log10(l))
log_df = pd.DataFrame()
log_df['likes_log'] = likes_log
log_df['views_log'] = views_log
sns.regplot(x='views_log', y='likes_log', data=log_df)
plt.show()
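To quantify how tight this log-log relationship is, we can also compute the Pearson correlation of the logged values (a quick check using scipy, which we already imported as stats):
# correlation between log-views and log-likes
r, p = stats.pearsonr(log_df['views_log'], log_df['likes_log'])
print(f"Pearson r = {r:.3f} (p = {p:.2g})")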
This is more interesting, but there are too many points. Let's plot just the first 1% of the points. This may seem like a tiny amount, but keep in mind we have almost 100,000 rows of data, so 1% is still about a thousand rows.
# Plot only the first 1% of the points to reduce clutter
log_df_1pct = log_df.iloc[:len(log_df) // 100]
sns.regplot(x='views_log', y='likes_log', data=log_df_1pct)
plt.show()
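One caveat: the first 1% of rows is not a random sample, since the rows are ordered by trending date. If we wanted an unbiased thinning of the plot, a random sample would be safer; a sketch (the seed is arbitrary):
# randomly sample 1% of the points instead of taking the first 1%
sampled = log_df.sample(frac=0.01, random_state=0)
sns.regplot(x='views_log', y='likes_log', data=sampled)
plt.show()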
Next, let's see whether most of the videos that made it into trending have a high like ratio.
plt.hist(df['likes_ratio'])
plt.xlabel(r"Like Ratio (%)")
plt.ylabel("Number of Videos")
plt.title("What percent of trending videos have a high like ratio?")
plt.show()
We see that the overwhelming majority of videos in trending are in fact very well received! This means that sensational or controversial content (which typically has a lower like ratio) has a harder time breaking into trending.
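We can put a number on this; a quick check (the exact share depends on the dataset snapshot):
# share of trending videos with a like ratio above 90%
high_share = (df['likes_ratio'] > 90).mean() * 100
print(f"{high_share:.1f}% of trending videos have a like ratio above 90%")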
If you wanted to make a video to break into trending, what category of video should you make?
# Count trending entries per category, keeping only categories with more than 100 videos
counts = df['category'].value_counts()
categories = counts[counts > 100]
fig = plt.figure(figsize=(25, 15))
plt.bar(categories.index, categories.values)
plt.xlabel("Categories")
plt.ylabel("Number of Videos")
plt.title("Number of Trending Videos per Category")
plt.show()
We see that the three most popular categories are Music, Sports, and Entertainment.
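To read the exact ordering off the data rather than eyeballing the bars, we can print the per-category counts computed above:
print(counts.head(3))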
We want to see if there's a relationship between the trending date (month) and the chances of a video making it to trending.
In this part of the project, we'll be looking into the most popular trending categories of YouTube videos per month; however, from looking at the head of our dataframe in the cell above, you can see that the format of the 'trending_date' column is odd and needs some cleaning. Once this column is modified, it can be converted to standard datetime format for further analysis.
# taking the first ten characters (YYYY-MM-DD) of each trending date,
# effectively removing the constant time suffix at the end
df['trending_date'] = df['trending_date'].str[:10]
# converting trending_date col to datetime format
df['trending_date'] = pd.to_datetime(df['trending_date'], format='%Y-%m-%d')
df.head()
title | publishedAt | channelTitle | trending_date | tags | view_count | likes | dislikes | comment_count | comments_disabled | ratings_disabled | category | likes_ratio | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | I ASKED HER TO BE MY GIRLFRIEND... | 2020-08-11T19:20:14Z | Brawadis | 2020-08-12 | brawadis|prank|basketball|skits|ghost|funny vi... | 1514614 | 156908 | 5855 | 35313 | False | False | People & Blogs | 96.402745 |
1 | Apex Legends | Stories from the Outlands – “Th... | 2020-08-11T17:00:10Z | Apex Legends | 2020-08-12 | Apex Legends|Apex Legends characters|new Apex ... | 2381688 | 146739 | 2794 | 16549 | False | False | Gaming | 98.131516 |
2 | I left youtube for a month and THIS is what ha... | 2020-08-11T16:34:06Z | jacksepticeye | 2020-08-12 | jacksepticeye|funny|funny meme|memes|jacksepti... | 2038853 | 353787 | 2628 | 40221 | False | False | Entertainment | 99.262657 |
3 | XXL 2020 Freshman Class Revealed - Official An... | 2020-08-11T16:38:55Z | XXL | 2020-08-12 | xxl freshman|xxl freshmen|2020 xxl freshman|20... | 496771 | 23251 | 1856 | 7647 | False | False | Music | 92.607639 |
4 | Ultimate DIY Home Movie Theater for The LaBran... | 2020-08-11T15:10:05Z | Mr. Kate | 2020-08-12 | The LaBrant Family|DIY|Interior Design|Makeove... | 1123889 | 45802 | 964 | 2196 | False | False | Howto & Style | 97.938673 |
Now we want to start analyzing by looking into which video categories are trending in this dataset.
# total views, likes, etc. per category (numeric columns only)
byCat = df.groupby(by=['category']).sum(numeric_only=True).reset_index()
display(byCat)
category | view_count | likes | dislikes | comment_count | comments_disabled | ratings_disabled | likes_ratio | |
---|---|---|---|---|---|---|---|---|
0 | Autos & Vehicles | 1914785329 | 88772670 | 1408858 | 7713271 | 0 | 0 | 1.707036e+05 |
1 | Comedy | 10511614324 | 718422436 | 11174623 | 39961401 | 5 | 0 | 5.400268e+05 |
2 | Education | 3337783703 | 209396163 | 4040488 | 15941268 | 21 | 0 | 2.124092e+05 |
3 | Entertainment | 65440917179 | 3346084989 | 62549431 | 223442458 | 356 | 0 | 1.913180e+06 |
4 | Film & Animation | 8162675980 | 374676085 | 6686643 | 30691115 | 16 | 0 | 3.655510e+05 |
5 | Gaming | 38245601154 | 2112545311 | 38395646 | 169568027 | 98 | 0 | 1.657954e+06 |
6 | Howto & Style | 4627675942 | 222634390 | 7266989 | 18647080 | 47 | 0 | 3.095635e+05 |
7 | Music | 83492867653 | 5706361753 | 106002834 | 661578759 | 28 | 0 | 1.750666e+06 |
8 | News & Politics | 6084677056 | 80036755 | 9979392 | 22407985 | 314 | 0 | 3.068019e+05 |
9 | Nonprofits & Activism | 131399890 | 11096562 | 112968 | 397858 | 0 | 0 | 8.624558e+03 |
10 | People & Blogs | 18465758586 | 1108635493 | 31225806 | 74720498 | 217 | 0 | 8.246492e+05 |
11 | Pets & Animals | 631401256 | 30929729 | 395656 | 2200316 | 0 | 0 | 4.994657e+04 |
12 | Science & Technology | 9356091709 | 406897672 | 9343119 | 25099157 | 265 | 0 | 3.378653e+05 |
13 | Sports | 21981874704 | 551473512 | 15843036 | 46554551 | 44 | 0 | 1.016712e+06 |
14 | Travel & Events | 408323428 | 21577650 | 453556 | 1289756 | 0 | 0 | 4.004674e+04 |
We see that out of the full set of YouTube categories, our dataset contains videos from only 15, so we'll focus on these for this portion of the data exploration. Next, we can divide up the rows of our dataframe by each of these 15 categories and go on from there.
# getting each category's data as a separate df
grouped = df.groupby(df.category)
auto = grouped.get_group("Autos & Vehicles")
com = grouped.get_group("Comedy")
edu = grouped.get_group("Education")
entertain = grouped.get_group("Entertainment")
film = grouped.get_group("Film & Animation")
gaming = grouped.get_group("Gaming")
howto = grouped.get_group("Howto & Style")
music = grouped.get_group("Music")
news = grouped.get_group("News & Politics")
activism = grouped.get_group("Nonprofits & Activism")
blogs = grouped.get_group("People & Blogs")
animals = grouped.get_group("Pets & Animals")
scitech = grouped.get_group("Science & Technology")
sports = grouped.get_group("Sports")
travel = grouped.get_group("Travel & Events")
auto.head()
title | publishedAt | channelTitle | trending_date | tags | view_count | likes | dislikes | comment_count | comments_disabled | ratings_disabled | category | likes_ratio | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
105 | $3000 vs. $500 Suspension Upgrade | 2020-08-09T16:00:12Z | Donut Media | 2020-08-12 | donut media|hi low|james pumphrey|nolan skyes|... | 1354257 | 62586 | 560 | 3888 | False | False | Autos & Vehicles | 99.113166 |
127 | Road Rage – Why Motorcycle Riders Smash Windows | 2020-08-09T15:00:01Z | FortNine | 2020-08-12 | motorcycle road rage compilation|motorcycle cr... | 342870 | 30254 | 580 | 3124 | False | False | Autos & Vehicles | 98.118960 |
162 | First Shreds! Testing Our Duramax Swapped Cama... | 2020-08-07T15:00:32Z | Hoonigan Project Cars | 2020-08-12 | Knuckle Busters|Knuckle Busters 2|Edelbrock|Ed... | 466200 | 15363 | 316 | 1272 | False | False | Autos & Vehicles | 97.984565 |
187 | Ford Sticks It To Jeep By Taking The New Bronc... | 2020-08-06T21:04:51Z | TFLoffroad | 2020-08-12 | TFLoffroad|TFL|2021 ford bronco|2021|ford bron... | 511098 | 6700 | 761 | 2191 | False | False | Autos & Vehicles | 89.800295 |
215 | We Buy a Donk Project. Goal: 1000HP! Part 1 of 69 | 2020-08-12T16:00:08Z | Hoonigan | 2020-08-13 | Donk|Box|Bubble|Donk magazine|RIDES Magazine|C... | 112081 | 6115 | 101 | 2021 | False | False | Autos & Vehicles | 98.375161 |
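As an aside, the fifteen get_group calls above could be collapsed into a single dictionary comprehension; we keep the explicit variables for readability, but here is a sketch of the compact form:
# one dataframe per category, keyed by category name
by_category = {name: group for name, group in df.groupby('category')}
by_category['Autos & Vehicles'].head()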
In order to properly identify the most popular categories per month, we can take the split-up groups of rows (one for each category) and compute each one's total monthly view count. The total views per month of the 'Autos & Vehicles' category is shown as an example. (Likes are summed alongside view count so that the result is a dataframe, which is what we want, and not a series.)
categories = [auto, com, edu, entertain, film, gaming, howto, music, news, activism, blogs, animals, scitech, sports, travel]
names = ['auto', 'com', 'edu', 'entertain', 'film', 'gaming', 'howto', 'music', 'news', 'activism', 'blogs', 'animals', 'scitech', 'sports', 'travel']
# Total views and likes per trending month, one frame per category
sums = {}
for name, cat in zip(names, categories):
    sums[name] = cat.groupby(cat.trending_date.dt.month)[['view_count', 'likes']].sum()
sums['auto']
view_count | likes | |
---|---|---|
trending_date | ||
1 | 113389596 | 4060354 |
2 | 187585344 | 3962879 |
3 | 71786108 | 2725930 |
4 | 97722045 | 2306466 |
5 | 82213650 | 5587252 |
6 | 297415169 | 19399228 |
7 | 83316454 | 4023365 |
8 | 178427336 | 7179923 |
9 | 203266951 | 15289605 |
10 | 203936100 | 7045607 |
11 | 191377634 | 8666578 |
12 | 204348942 | 8525483 |
In this step, we make a dictionary mapping each month to the total view count of every category. Each key is a number corresponding to a month, and each value is a list of (view_count, category) tuples. If the key already exists in the map, the tuple is appended to that key's list; otherwise, the key-value pair is initialized in the dictionary.
months = {}
for key in sums:
    # key is the name of the category
    for index, row in sums[key].iterrows():
        if index in months:
            months[index].append((row['view_count'], key))
        else:
            months[index] = [(row['view_count'], key)]
# sorting months by month number (1-12)
months = {key: val for key, val in sorted(months.items(), key=lambda ele: ele[0])}
#months
Now that we have a dictionary mapping each month to its views by category, we can sort each month's list in descending order to find the category with the most views in each month, which was the objective of this portion.
# top category (by views) from each month of the year
for i in range(1, 13):
    months[i].sort(reverse=True)
    print('month', i, ':', months[i][0][1])
month 1 : music
month 2 : music
month 3 : music
month 4 : entertain
month 5 : music
month 6 : entertain
month 7 : entertain
month 8 : music
month 9 : music
month 10 : music
month 11 : music
month 12 : music
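As an aside, the same result can be read off with a one-line dictionary comprehension (a sketch using the months dict built above; max on the (view_count, category) tuples picks the highest-viewed category for each month):
top_per_month = {m: max(months[m])[1] for m in range(1, 13)}
print(top_per_month)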
# months in which Music had the most views (per the printout above)
music_months = [1, 2, 3, 5, 8, 9, 10, 11, 12]
cols = ['Jan', 'Feb', 'Mar', 'May', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
df2 = pd.DataFrame({'views': [months[m][0][0] for m in music_months]}, index=cols)
ax = df2.plot(kind='bar', title='Most Popular Category per Month - Music', color='g', legend=False)
ax.set_xlabel("Month of Year")
ax.set_ylabel("Views")
plt.show()
# months in which Entertainment had the most views
entertain_months = [4, 6, 7]
cols = ['Apr', 'Jun', 'Jul']
df3 = pd.DataFrame({'views': [months[m][0][0] for m in entertain_months]}, index=cols)
ax = df3.plot(kind='bar', title='Most Popular Category per Month - Entertainment', color='b', legend=False)
ax.set_xlabel("Month of Year")
ax.set_ylabel("Views")
plt.show()
We can see that only two of the 15 categories in our dataset ever had the most views in a month. The 'Entertainment' category had the most views in April, June, and July, while the 'Music' category had more views than any other category in every other month of the year.
Now, we are going to be looking at controversial videos. We define a controversial video as one having a like ratio below 50% and a comment count at or above the 60th percentile of all trending videos. The reason for choosing these criteria is that they limit the dataset to trending videos that caused controversy, while not limiting it so much that we don't have enough data to work with.
#controversial videos: high comment count, low like ratio
# like ratio below 50% and comment count at or above the 60th percentile
threshold = df['comment_count'].quantile(0.60)
df2 = df[(df['likes_ratio'] < 50) & (df['comment_count'] >= threshold)].copy()
#remove duplicate titles (same video trending at a different time)
df2 = df2.drop_duplicates(subset=['title'])
We have now limited our dataset to only the trending videos that are controversial by our criteria, and we can see that we are not left with many results. This mainly tells us that videos of a controversial nature do not commonly reach the YouTube trending page.
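How few, exactly? A quick count (the exact number depends on the dataset snapshot):
print(len(df2), "controversial videos out of", len(df), "trending entries")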
Let's examine this data further. What are the main categories of controversial trending videos?
### Bar plot of the number of controversial videos per category
sns.set(rc={'figure.figsize':(13,10)})
plot = sns.countplot(x="category", data=df2)
plot.set(title='Controversial Trending YouTube Videos per Category')
plt.show()
From the plot above, we can clearly see that the main category of controversial videos that go on the trending page is "News & Politics".
This tells us that videos regarding politics or world news are more likely to generate a lot of discussion in the comment section, in addition to having a rather low percentage of people liking the video.
This should not be surprising. However, we can also look at the category "News & Politics" in general to see how many of those videos in our dataset are controversial (based on our criteria).
### split "News & Politics" into 'controversial' and 'non-controversial'
# restrict to News & Politics and flag each video against our controversy criteria
news_df = df[df['category'] == "News & Politics"].copy()
threshold = df['comment_count'].quantile(0.60)
news_df['is_controversial'] = (news_df['likes_ratio'] < 50) & (news_df['comment_count'] >= threshold)
#plot the two groups
sns.set(rc={'figure.figsize':(4,10)})
ax = sns.countplot(x="is_controversial", data=news_df)
ax.set(title='Controversial Videos in the News & Politics Category', xlabel='Controversial', ylabel='Count')
plt.show()
From the bar plot above, we can see that very few trending videos in the "News & Politics" category are controversial. So although most controversial trending videos fall under News & Politics, they make up only a small fraction of that category.
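To put a number on it, here is a quick check using the news_df frame built above (the exact figure depends on the dataset snapshot):
# fraction of News & Politics trending videos meeting our controversy criteria
share = news_df['is_controversial'].mean() * 100
print(f"{share:.1f}% of trending News & Politics videos are controversial by our criteria")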
So from our tutorial we see that to make a trending video, you first want to keep the audience engaged. This shows up as a high like ratio and a healthy number of likes relative to views. Videos in certain categories trend far more often (e.g., Music, Entertainment), so it's more worthwhile to focus on content in those fields. It's also important to note that trending videos are rarely controversial, as seen by the very low number of controversial videos even in "News & Politics"; since that category contained the most controversial videos, it's reasonably safe to extrapolate this to all categories.
We hope this helps you make a banger YouTube channel, and we hope to see you on the trending page.