Je suis entrain de déployer mon premier projet machine learning en utilisant le framework flask (le projet s'agit de la classification multiclasse des données textuelle) lorsque je saisi mon texte et j'effectue la prédiction voilà ce qu'il maffiche: "ValueError: empty vocabulary; perhaps the documents only contain stop words";
y'a t'il qqu'un qui peut m'aider svp
Voila le code source :
app.py
import numpy as np
from flask import Flask, request, jsonify, render_template
from flask_bootstrap import Bootstrap
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn import svm
import pandas as pd
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
import re
import nltk
import gensim
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('words')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim.parsing.preprocessing import remove_stopwords, STOPWORDS
nltk.download('averaged_perceptron_tagger')
from nltk.tokenize import word_tokenize
from nltk import pos_tag
nltk.download('stopwords')
from nltk.corpus import stopwords
nltk.download('wordnet')
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
app= Flask(__name__)
model = pickle.load(open('model.pkl', 'rb'))
@app.route('/')
def home():
return render_template('home.html')
@app.route('/predict', methods=["POST"])
def predict():
'''
pour rendre le resultat en HTML GUI
'''
if request.method == "POST":
review = request.form['review']
# Lets do some text cleanup
REPLACE_BY_SPACE_RE = re.compile("[/(){}\.?!&$€’`<>-_[\]\|@,;]#“”&%'~*°")
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
REMOVE_NUM = re.compile('[\d+]')
#STOPWORDS = set(stopwords.words('english'))
def clean_text(text):
# lowercase text
text = [[word.lower() for word in txt.split()] for txt in text]
# replace ('[/(){}\[\]\|@,;]') symbols by space in text
text = REPLACE_BY_SPACE_RE.sub(' ', str(text))
# Remove white space
text = REMOVE_NUM.sub(' ', text)
# delete symbols which are in BAD_SYMBOLS_RE ('[^0-9a-z #+_]') from text
text = BAD_SYMBOLS_RE.sub(' ', text)
# delete stopwords from text
word_tokens = word_tokenize(text)
text = ' '.join(word for word in word_tokens if word not in STOPWORDS)
# removes any words composed of less than 2 or more than 21 letters
text = ' '.join(word for word in text.split() if (len(word) >= 2 and len(word) <= 21))
return text
review= clean_text(str(review))
# POS tagger dictionary
pos_dict = {'AJ':wordnet.ADJ, 'V':wordnet.VERB, 'N':wordnet.NOUN, 'Av':wordnet.ADV}
def token_stop_pos(text):
tags = pos_tag(word_tokenize(text))
newlist = []
for word, tag in tags:
newlist.append(tuple([word, pos_dict.get(tag[0])]))
return newlist
review= token_stop_pos(review)
wordnet_lemmatizer = WordNetLemmatizer()
def lemmatize(pos_data):
lemma_rew = " "
for word, pos in pos_data:
if not pos:
lemma = word
lemma_rew = lemma_rew + " " + lemma
else:
lemma = wordnet_lemmatizer.lemmatize(word, pos=pos)
lemma_rew = lemma_rew + " " + lemma
return lemma_rew
review= lemmatize(review)
def removeNonEnglishWordsFunct(x):
words = set(nltk.corpus.words.words())
filteredSentence = " ".join(w for w in nltk.wordpunct_tokenize(x)\
if w.lower() in words or not w.isalpha())
return filteredSentence
review =removeNonEnglishWordsFunct(review)
# Extract Feature With TF-IDF
cv = TfidfVectorizer(ngram_range=(1, 1))
data = [review]
vect = cv.fit_transform(data).toarray()
my_prediction = model.predict(vect)
return render_template('result.html', prediction =my_prediction)
if __name__ =='__main__':
app.run(debug=True)
- Edité par Jihenhs 27 novembre 2021 à 4:03:14
Déploiement fu modèle machine learning avec flask
× Après avoir cliqué sur "Répondre" vous serez invité à vous connecter pour que votre message soit publié.
× Attention, ce sujet est très ancien. Le déterrer n'est pas forcément approprié. Nous te conseillons de créer un nouveau sujet pour poser ta question.