Download the YouTube Spam Collection dataset from https://archive.ics.uci.edu/ml/machine-learning-databases/00380/
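If you'd rather fetch it from the notebook itself, here's a minimal sketch using only the standard library. The archive name YouTube-Spam-Collection-v1.zip and the extraction folder are assumptions, so check the UCI directory listing and adjust the paths to match:

# minimal download sketch -- the zip filename below is an assumption,
# verify it against the UCI directory listing first
import urllib.request
import zipfile

url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00380/YouTube-Spam-Collection-v1.zip"
urllib.request.urlretrieve(url, "youtube-spam.zip")
with zipfile.ZipFile("youtube-spam.zip") as zf:
    zf.extractall("C:/data/spam comments")  # the five Youtube0X-*.csv files land here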
I built this project as a Jupyter notebook inside VS Code, but you can follow along in any Jupyter notebook or even a plain Python file.
Here's the whole code of the project for you:
import pandas as pd

# load all five datasets and concatenate them into one frame
# (ignore_index=True gives one clean 0..n index instead of five overlapping ones)
data = pd.concat([pd.read_csv("C:/data/spam comments/Youtube01-Psy.csv"),
                  pd.read_csv("C:/data/spam comments/Youtube02-KatyPerry.csv"),
                  pd.read_csv("C:/data/spam comments/Youtube03-LMFAO.csv"),
                  pd.read_csv("C:/data/spam comments/Youtube04-Eminem.csv"),
                  pd.read_csv("C:/data/spam comments/Youtube05-Shakira.csv")],
                 ignore_index=True)
data.head()

# our data now holds comments from 5 different YouTube videos;
# let's check how many spam and not-spam comments we have
print(len(data[data["CLASS"] == 0]))  # NOT SPAM
print(len(data[data["CLASS"] == 1]))  # SPAM

# shuffle the dataset and separate the comments (CONTENT) from the labels (CLASS)
data_shuf = data.sample(frac=1).reset_index(drop=True)
data_content = data_shuf["CONTENT"]
data_label = data_shuf["CLASS"]

# we use the Pipeline feature to apply our CountVectorizer and random forest in one step:
# - CountVectorizer breaks each comment down into individual words, doing very basic
#   processing along the way (lower-casing, stripping punctuation) and counting the words
# - RandomForestClassifier then trains on those word counts; it performs well on this task
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier

pipeline = Pipeline([
    ("bag of words", CountVectorizer()),
    ("random forest", RandomForestClassifier()),
])

pipeline.fit(data_content.iloc[:1500], data_label.iloc[:1500])    # training data (up to the 1500th row)
pipeline.score(data_content.iloc[1500:], data_label.iloc[1500:])  # testing data (after the 1500th row)

# testing our model on a few fresh comments
print(pipeline.predict(["subscribe to my channel"]))           # 1 means SPAM
print(pipeline.predict(["great video man, you are so good"]))  # 0 means NOT SPAM
print(pipeline.predict(["click on my video"]))
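To make the CountVectorizer step less of a black box, here's a tiny standalone sketch (the two example sentences are made up) showing the vocabulary and the count matrix it produces:

from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer()
X = cv.fit_transform(["Subscribe to my channel!", "great video man"])
print(cv.get_feature_names_out())  # ['channel' 'great' 'man' 'my' 'subscribe' 'to' 'video']
print(X.toarray())                 # [[1 0 0 1 1 1 0]
                                   #  [0 1 1 0 0 0 1]]

Each comment becomes one row of that matrix; the random forest never sees the raw text, only these word counts.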
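And if you want to reuse the trained model without refitting every time, you can serialize the whole fitted pipeline with joblib (which scikit-learn itself depends on); the filename here is just a placeholder:

import joblib

joblib.dump(pipeline, "spam_pipeline.joblib")  # saves the fitted vectorizer and forest together
loaded = joblib.load("spam_pipeline.joblib")
print(loaded.predict(["check out my channel"]))  # works exactly like the original pipeline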