- We first convert the comments to lowercase and then use custom-made functions to remove HTML tags, punctuation, and non-alphabetic characters from them.
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
import re
import sys
import warningsdata = data_rawif not sys.warnoptions:
warnings.simplefilter("ignore")def cleanHtml(sentence):
cleanr = re.compile('<.*?>')
cleantext = re.sub(cleanr, ' ', str(sentence))
return cleantextdef cleanPunc(sentence): #function to clean the word of any punctuation or special characters
cleaned
Visit source site to finish reading.