In today's data-driven world, unstructured text data is becoming increasingly valuable. From customer reviews and feedback to social media posts and emails, organizations have access to a wealth of unstructured data that can provide insights into customer behavior and sentiment. However, analyzing and making sense of this data can be a daunting task, requiring a combination of natural language processing (NLP) techniques and machine learning algorithms.
In this article, we will walk through a complete data science project for sentiment analysis and product recommendation using Python. We will cover key skills and technologies, including text preprocessing, feature extraction, machine learning, and model selection and evaluation.
The Problem: Sentiment Analysis and Product Recommendation
Our goal is to build a model that can analyze customer feedback from various channels, such as social media, email, and support tickets, and categorize the feedback as positive, negative, or neutral. We will use Python and the NLTK library to preprocess the text and extract features, and then train a sentiment classification model (in this project, a linear support vector machine) to perform the sentiment analysis.
We will then use the sentiment analysis results to enhance a product recommendation engine. By combining customer sentiment data with purchase history and browsing behavior, we can build a collaborative filtering recommendation model (here, matrix factorization with the Surprise library's SVD algorithm) to suggest relevant products to customers.
To demonstrate the sentiment analysis and product recommendation project, we will use the Sentiment140 dataset, a collection of 1.6 million labeled tweets that stands in for customer feedback. The dataset contains the following columns: target (the sentiment label: 0 = negative, 2 = neutral, 4 = positive), ids (the tweet ID), date (the tweet date), flag (the query used to collect the tweet, or NO_QUERY), user (the author's username), and text (the tweet text).
The text column will serve as input to our sentiment analysis model, while the user, ids, and target columns will be used for our product recommendation model.
Step 1: Text Preprocessing
The first step in our sentiment analysis and product recommendation project is to preprocess the text data. This involves a number of steps: removing URLs and user mentions, removing special characters and digits, converting to lowercase, tokenizing the text, removing stop words, and stemming or lemmatizing the words.
In natural language processing, tokenization is the process of breaking up a text into smaller units called tokens. These tokens could be words, phrases, or even individual characters. The tokenization process is a crucial step in many natural language processing tasks, including sentiment analysis, machine translation, and named entity recognition.
A typical tokenization pipeline splits the text on whitespace, separates punctuation from adjacent words, and handles special cases such as contractions and hyphenated words.
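For example, NLTK's word_tokenize handles punctuation and contractions for us; here is a quick sketch on a made-up sentence:
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')  # tokenizer models; only needed once
sentence = "I can't wait to try this product!"
print(word_tokenize(sentence))
# ['I', 'ca', "n't", 'wait', 'to', 'try', 'this', 'product', '!']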
After tokenization, the next step is often lemmatization. Lemmatization is the process of reducing words to their base or root form. For example, the word "running" can be reduced to its base form "run". This is important because it helps to reduce the dimensionality of the feature space, which can make natural language processing tasks more computationally efficient.
A typical lemmatization pipeline tags each token with its part of speech and then looks the token up in a lexical database such as WordNet to map it to its base form (its lemma).
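Here is a quick sketch with NLTK's WordNetLemmatizer on a few made-up inputs; note that it treats every word as a noun unless you pass a part-of-speech tag:
import nltk
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')  # WordNet lexical database; only needed once
lemmatizer = WordNetLemmatizer()
print(lemmatizer.lemmatize("running"))           # 'running' (treated as a noun)
print(lemmatizer.lemmatize("running", pos="v"))  # 'run'
print(lemmatizer.lemmatize("feet"))              # 'foot'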
Overall, tokenization and lemmatization are essential steps in natural language processing. They convert raw text into a format that machine learning algorithms can process, and they reduce the dimensionality of the feature space, making NLP tasks more computationally efficient.
import pandas as pd
import numpy as np
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import SnowballStemmer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split as surprise_train_test_split
from surprise.accuracy import rmse, mae
from surprise.model_selection import GridSearchCV
import multiprocessing
We will use the NLTK library to perform the text preprocessing. The following function takes a text string as input and returns the preprocessed text; it relies on the stop_words, stemmer, and TEXT_CLEANING_RE objects defined a little further below:
def preprocess(text, stem=False):
    # Remove links, user mentions, and special characters, then lowercase
    text = re.sub(TEXT_CLEANING_RE, ' ', str(text).lower()).strip()
    tokens = []
    for token in text.split():
        if token not in stop_words:
            if stem:
                tokens.append(stemmer.stem(token))
            else:
                tokens.append(token)
    return " ".join(tokens)
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
# Load the data
df = pd.read_csv(
    "./data/training.1600000.processed.noemoticon.csv",
    names=["target", "ids", "date", "flag", "user", "text"],
    encoding="ISO-8859-1",
    dtype={"target": "int", "ids": "int", "date": "string", "flag": "string", "user": "string", "text": "string"},
)
display(df.head())
df.info()
| | target | ids | date | flag | user | text |
|---|---|---|---|---|---|---|
| 0 | 0 | 1467810369 | Mon Apr 06 22:19:45 PDT 2009 | NO_QUERY | _TheSpecialOne_ | @switchfoot http://twitpic.com/2y1zl - Awww, t... |
| 1 | 0 | 1467810672 | Mon Apr 06 22:19:49 PDT 2009 | NO_QUERY | scotthamilton | is upset that he can't update his Facebook by ... |
| 2 | 0 | 1467810917 | Mon Apr 06 22:19:53 PDT 2009 | NO_QUERY | mattycus | @Kenichan I dived many times for the ball. Man... |
| 3 | 0 | 1467811184 | Mon Apr 06 22:19:57 PDT 2009 | NO_QUERY | ElleCTF | my whole body feels itchy and like its on fire |
| 4 | 0 | 1467811193 | Mon Apr 06 22:19:57 PDT 2009 | NO_QUERY | Karoli | @nationwideclass no, it's not behaving at all.... |
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1600000 entries, 0 to 1599999
Data columns (total 6 columns):
 #   Column  Non-Null Count    Dtype
---  ------  --------------    -----
 0   target  1600000 non-null  int32
 1   ids     1600000 non-null  int32
 2   date    1600000 non-null  string
 3   flag    1600000 non-null  string
 4   user    1600000 non-null  string
 5   text    1600000 non-null  string
dtypes: int32(2), string(4)
memory usage: 61.0 MB
# Sentiment140 encodes sentiment as 0 = negative, 2 = neutral, 4 = positive
# (the published training split contains only 0 and 4)
decode_map = {0: "NEGATIVE", 2: "NEUTRAL", 4: "POSITIVE"}
def decode_sentiment(label):
    return decode_map[int(label)]
%%time
df.target = df.target.apply(lambda x: decode_sentiment(x))
CPU times: total: 406 ms Wall time: 485 ms
stop_words = stopwords.words("english")
stemmer = SnowballStemmer("english")
# Matches @mentions, URLs, and runs of non-alphanumeric characters
TEXT_CLEANING_RE = r"@\S+|https?:\S+|[^A-Za-z0-9]+"
%%time
df["clean_text"] = df["text"].apply(lambda x: preprocess(x))
CPU times: total: 44.1 s Wall time: 45 s
Step 2: Feature Extraction
Once we have preprocessed the text data, the next step is to extract features from it. We will use the TF-IDF algorithm to extract features from the preprocessed text data. TF-IDF stands for term frequency-inverse document frequency and is a numerical statistic that reflects how important a word is to a document in a collection or corpus.
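To build intuition before applying TF-IDF to our tweets, here is a minimal sketch on a hypothetical toy corpus (not the project data). Words that appear in every document receive lower weights than the distinguishing sentiment words:
from sklearn.feature_extraction.text import TfidfVectorizer
# Toy corpus: "the", "product", and "is" appear in every document,
# so they get lower IDF weights than "great", "terrible", and "okay"
corpus = ["the product is great", "the product is terrible", "the product is okay"]
vec = TfidfVectorizer()
X = vec.fit_transform(corpus)
print(vec.get_feature_names_out())  # the learned vocabulary
print(X.toarray().round(2))         # one row of TF-IDF weights per document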
We will use the TfidfVectorizer class from the scikit-learn library to perform the feature extraction. The following code shows how to initialize it with some common parameters:
%%time
# Train a sentiment analysis model
X_train, X_test, y_train, y_test = train_test_split(df["clean_text"], df["target"], test_size=0.2, random_state=42)
tfidf_vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1, 3), stop_words="english", use_idf=True, smooth_idf=True, sublinear_tf=True)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)
CPU times: total: 1min 4s Wall time: 1min 4s
Step 3: Model Training and Evaluation
With the features extracted, we can now train a machine learning model to perform the sentiment analysis. We will use a support vector machine (SVM) with a linear kernel. SVMs are a popular choice for text classification problems because they are effective at handling high-dimensional, sparse data such as TF-IDF features.
We will use the SVC class from the scikit-learn library with kernel="linear" to train the classifier. (For a dataset of this size, LinearSVC would train much faster; we keep SVC here and cap max_iter to bound the training time.) The following code shows how to initialize the classifier with some common parameters:
%%time
# Linear-kernel SVM; max_iter is capped to bound training time, which
# triggers the convergence warning below and costs some accuracy
svm_classifier = SVC(kernel="linear", max_iter=1000, tol=0.01)
# We can then fit the SVM classifier to the TF-IDF features and the target column:
svm_classifier.fit(X_train_tfidf, y_train)
y_pred = svm_classifier.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, y_pred)
print("Sentiment analysis accuracy:", accuracy)
report = classification_report(y_test, y_pred)
print("Sentiment analysis classification report:\n", report)
conf_matrix = confusion_matrix(y_test, y_pred)
print("Sentiment analysis confusion matrix:\n", conf_matrix)
c:\Users\ronal\anaconda3\lib\site-packages\sklearn\svm\_base.py:299: ConvergenceWarning: Solver terminated early (max_iter=1000). Consider pre-processing your data with StandardScaler or MinMaxScaler.
  warnings.warn(
Sentiment analysis accuracy: 0.530746875
Sentiment analysis classification report:
               precision    recall  f1-score   support

    NEGATIVE       0.69      0.11      0.19    159494
    POSITIVE       0.52      0.95      0.67    160506

    accuracy                           0.53    320000
   macro avg       0.60      0.53      0.43    320000
weighted avg       0.60      0.53      0.43    320000

Sentiment analysis confusion matrix:
 [[ 17049 142445]
 [  7716 152790]]
CPU times: total: 2min 20s
Wall time: 2min 22s
The accuracy above is only about 53 percent, barely better than chance; the capped max_iter is the main culprit, and raising it (or switching to LinearSVC) would improve the result considerably. To ensure that our machine learning models are performing well, we need to evaluate them using appropriate metrics. We will use cross-validation and grid search to find good hyperparameters, and metrics such as accuracy, the classification report, and the confusion matrix to evaluate performance.
We can use the cross_val_score() function from the scikit-learn library to perform cross-validation on the SVM classifier:
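A minimal sketch of this step (3-fold cross-validation on 1.28 million training rows is slow, so you may want to subsample first):
from sklearn.model_selection import cross_val_score
# 3-fold cross-validation of the linear SVM on the TF-IDF training features
cv_scores = cross_val_score(svm_classifier, X_train_tfidf, y_train, cv=3, scoring="accuracy")
print("Cross-validation accuracy scores:", cv_scores)
print("Mean CV accuracy:", cv_scores.mean())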
Step 4: Product Recommendation
With the sentiment model in place, we can train the recommendation model using the Surprise library. We map the sentiment labels back to numeric values and treat each user's sentiment toward an item as an implicit rating:
target_map = {'NEGATIVE': 0, 'POSITIVE': 1}
df['target'] = df['target'].map(target_map)
# Train a product recommendation model: the binary sentiment serves as
# an implicit 0/1 "rating", so the rating scale is (0, 1)
reader = Reader(rating_scale=(0, 1))
data = Dataset.load_from_df(df[["user", "ids", "target"]], reader)
trainset, testset = surprise_train_test_split(data, test_size=0.2, random_state=42)
param_grid = {"n_epochs": [10, 20], "lr_all": [0.002, 0.005], "reg_all": [0.4, 0.6]}
grid_search = GridSearchCV(SVD, param_grid, measures=["rmse", "mae"], cv=3)
grid_search.fit(data)
best_rmse = grid_search.best_score["rmse"]
best_mae = grid_search.best_score["mae"]
best_params = grid_search.best_params["rmse"]
print("Product recommendation RMSE:", best_rmse)
print("Product recommendation MAE:", best_mae)
print("Product recommendation best parameters:", best_params)
algo = SVD(n_epochs=best_params["n_epochs"], lr_all=best_params["lr_all"], reg_all=best_params["reg_all"], verbose=False)
algo.fit(trainset)
predictions = algo.test(testset)
rmse_score = rmse(predictions)
mae_score = mae(predictions)
print("Product recommendation RMSE:", rmse_score)
print("Product recommendation MAE:", mae_score)
# Combine sentiment analysis and product recommendation
df["predicted_sentiment"] = svm_classifier.predict(tfidf_vectorizer.transform(df["clean_text"]))
df["predicted_rating"] = df.apply(lambda row: algo.predict(row["user"], row["ids"]).est, axis=1)
# Save the results to a new CSV file
df.to_csv("customer_feedback_with_predictions.csv", index=False)
Product recommendation RMSE: 0.4772411223324102
Product recommendation MAE: 0.468828564792572
Product recommendation best parameters: {'n_epochs': 20, 'lr_all': 0.005, 'reg_all': 0.4}
RMSE: 0.4753
MAE: 0.4660
Product recommendation RMSE: 0.47527843956146315
Product recommendation MAE: 0.4659663620618836
def recommend(user):
    # Get the set of item IDs the user has already rated
    rated_ids = set(df.loc[df["user"] == user, "ids"])
    # Get the list of all unique item IDs
    all_ids = df["ids"].unique()
    # Remove the already-rated items from the candidate list
    item_ids = [item_id for item_id in all_ids if item_id not in rated_ids]
    # Create a list of (item ID, predicted rating) tuples for the user
    item_ratings = [(item_id, algo.predict(user, item_id).est) for item_id in item_ids]
    # Sort the items by predicted rating (descending order)
    item_ratings.sort(key=lambda x: x[1], reverse=True)
    # Return the top 5 recommended items
    return item_ratings[:5]
user = "john123"
recommended_items = recommend(user)
print("Recommended items for user", user, ":")
for item in recommended_items:
print("- Item ID:", item[0], "- Predicted rating:", item[1])
Recommended items for user john123 :
- Item ID: 1966876510 - Predicted rating: 0.58289153336724
- Item ID: 1693433586 - Predicted rating: 0.5789923239998461
- Item ID: 1957741658 - Predicted rating: 0.5787092243641766
- Item ID: 2055182704 - Predicted rating: 0.5786675266045753
- Item ID: 1881797143 - Predicted rating: 0.5784505736784116
In this article, we have walked through a complete data science project for sentiment analysis and product recommendation using Python. We covered key skills and technologies, including text preprocessing, feature extraction, machine learning, and model selection and evaluation.
By showcasing this project in a data science portfolio, a data scientist can demonstrate their proficiency in these key areas to potential employers. With the rise of unstructured text data and the need to make sense of it, data scientists who can perform sentiment analysis and build product recommendation engines will be in high demand in the years to come.