Is it possible to analyze customer requests and identify the causes of negative reviews quickly and without high labor costs? In this article, we describe how we solved this problem with ML tools.
In our work, we needed to assess the quality of customer service: to analyze customer requests and identify the reasons behind negative reviews of after-sales service for insurance products.
The source data were free-text records of customer requests exported from our CRM system. Besides the customer's actual message, each record carried service information: dates, contract and policy numbers, operator marks and similar fields that would only add noise to the analysis.
So the first step was cleaning. We wrote a function that strips the service fields, numbers, Latin characters and punctuation from the text:
import re

def cl_text(text):
    c = text.lower()
    # drop the CRM service line (everything after the 'crm' marker)
    c = re.sub(r'crm[^\n]+', '', c)
    # drop contract/policy numbers of the form "XX XX XXXXXX"
    c = re.sub(r':\s*\d{2}\s?\d{2}\s?\d{6}\s*', '', c)
    # drop date-stamped service fields of the form "DD.MM.YYYY"
    # (one pattern per field in the original; the field-name literals before the colon are omitted here)
    c = re.sub(r'( )?:\s*\d{2}\.?\d{2}\.?\d{4}\s*', '', c)
    # drop a short text field following a colon
    c = re.sub(r' :[\S\W]\w*', '', c)
    # collapse line breaks and repeated whitespace
    c = re.sub(r'\n+', ' ', c)
    c = re.sub(r'\s+', ' ', c)
    # strip Latin characters and punctuation
    c = re.sub(r"[A-Za-z!#$%&'()*+,./:;<=>?@[\]^_`{|}~«»\"\-]+", ' ', c)
    return c.strip()
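As a quick check, here is the function on a masked record of the kind shown later in the article; the request text itself is a generic Russian placeholder:

raw = 'CRM+01.01.2020 XXXXXXXXXXXXX ***\n: 01.01.2020\nтекст обращения клиента\n: 00 00 000000'
print(cl_text(raw))  # -> 'текст обращения клиента': the CRM line, the date and the number are gone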
After cleaning, the texts were lemmatized so that different forms of the same word would be counted together. We normalized words with pymorphy2 and filtered out the Russian stopwords from NLTK, extended with a few domain-specific words:
import pymorphy2
import nltk

morph = pymorphy2.MorphAnalyzer()
stopwords = nltk.corpus.stopwords.words('russian')
stopwords.extend(['', '', '', '', '', ''])  # extended with domain-specific words

def lemmatize(text):
    text = re.sub(r"\d+", '', text.lower())  # drop any remaining digits
    tokens = []
    for token in text.split():
        token = token.strip()
        token = morph.normal_forms(token)[0].replace('ё', 'е')  # first normal form, 'ё' unified to 'е'
        if token and token not in stopwords:
            tokens.append(token)
    if len(tokens) > 2:  # keep only texts with at least three meaningful tokens
        return ' '.join(tokens)
    return None
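Chained together, the two functions turn a raw CRM record into a normalized string of lemmas, or None when fewer than three meaningful tokens remain:

prepared = lemmatize(cl_text(raw))  # 'raw' is the sample record from above
print(prepared)  # -> 'текст обращение клиент'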
An example of a record before and after cleaning (sensitive values are masked):

Before: 'CRM+XX.XX.XXXX XXXXXXXXXXXXX *** \n : XX.XX.XXXX\n .. [] , , , . \n : \n: XX XX XXXXXX'
After: ' '
The next step is to turn the texts into vectors. The classical approaches are OneHotEncoding and TF-IDF, but both build a vector from word occurrences and therefore ignore meaning: two requests that describe the same problem in different words end up with completely different vectors. For our task we needed the opposite: requests that are close in meaning should be close in vector space.
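A quick sketch of the problem (the two phrases are illustrative English stand-ins for the Russian originals): with TF-IDF, sentences that share no tokens get zero cosine similarity, no matter how close their meaning is.

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import pairwise

phrases = ['the payout was delayed', 'money still not received']  # same complaint, no shared words
tfidf = TfidfVectorizer().fit_transform(phrases)
print(pairwise.cosine_similarity(tfidf[0], tfidf[1]))  # [[0.]] -- TF-IDF sees nothing in common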
This is the problem the Universal Sentence Encoder solves: a pre-trained model that maps a whole sentence to a fixed-length vector such that semantically similar sentences get similar vectors. We used a locally saved copy of the multilingual version, which supports Russian.
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text  # registers the ops the multilingual model needs

model = hub.load(r'/UniverseSentenseEmbeddings/USEv3')  # locally saved copy of the model
embedding = model(['...'])  # a list of strings in, one 512-dimensional vector per string out
To sanity-check the model, we took one reference phrase and compared it against five others, ordered from close in meaning to unrelated, using cosine similarity of the embeddings:
from sklearn.metrics import pairwise

input1 = [' ']  # the reference phrase
input2 = [' ', ' ', ' ', ' ', ' ']  # five phrases, from closest in meaning to unrelated
emb1, emb2 = model(input1), model(input2)
results_cosine = pairwise.cosine_similarity(emb1, emb2).tolist()[0]
for i, res in enumerate(results_cosine):
    print('"{}" <> "{}", cos_sim={:.3f}'.format(input1[0], input2[i], res))
The output:
" " <> " ", cos_sim=0.860
" " <> " ", cos_sim=0.769
" " <> " ", cos_sim=0.748
" " <> " ", cos_sim=0.559
" " <> " ", cos_sim=0.192
The similarity score falls off as the phrases drift away in meaning from the reference one, which is exactly the behavior we needed.
We tried four clustering algorithms on the embeddings: DBSCAN, agglomerative (hierarchical) clustering, kMeans and MiniBatchKMeans. Agglomerative clustering produced the most interpretable grouping of requests, so it is the one we kept:
from sklearn.cluster import AgglomerativeClustering

num_clusters = 5
agglo1 = AgglomerativeClustering(n_clusters=num_clusters, affinity='euclidean')  # cosine, l1, l2, manhattan also possible
answer = agglo1.fit_predict(sent_embs)  # sent_embs -- the matrix of USE embeddings
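For reference, the other three algorithms we tried can be run on the same embedding matrix in the same way; the parameter values below are illustrative, not the ones from our experiments:

from sklearn.cluster import DBSCAN, KMeans, MiniBatchKMeans

candidates = {
    'DBSCAN': DBSCAN(eps=0.5, min_samples=5),  # density-based, infers the cluster count itself
    'KMEANS': KMeans(n_clusters=num_clusters),
    'MBKMEANS': MiniBatchKMeans(n_clusters=num_clusters),
}
labels = {name: algo.fit_predict(sent_embs) for name, algo in candidates.items()}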
The requests split into five clusters. To understand what each cluster is about, we looked at its ten most frequent lemmas:
from collections import Counter
from tqdm import tqdm

method = 'AGGLOM'  # column of the report DataFrame holding the cluster labels
cl = {}
for cluster, data in tqdm(report.groupby(method), desc=method):
    arr = ' '.join(data[''].values).split()  # the column holding the lemmatized texts
    arr_morph = [morph.parse(k)[0].normal_form for k in arr]
    cl[method + '_' + str(cluster)] = Counter(
        [x.replace('ё', 'е') for x in arr_morph if x not in stopwords]
    ).most_common(10)
This is what came out; each cluster was then interpreted manually from its top words:

| Cluster | Top 10 lemmas (lemma, count) | Interpretation |
|---|---|---|
| AGGLOM_0 | [('', 1548), ('', 786), ('', 565), ('', 552), ('', 494), ('', 427), ('', 371), ('', 73), ('', 45), ('', 40)] | |
| AGGLOM_1 | [('', 2984), ('', 2627), ('', 2205), ('', 2144), ('', 1932), ('', 1931), ('', 1688), ('', 1653), ('', 1571), ('', 1460)] | |
| AGGLOM_2 | [('', 3807), ('', 540), ('', 443), ('', 370), ('', 351), ('', 312), ('', 290), ('', 275), ('', 272), ('', 264)] | |
| AGGLOM_3 | [('', 1100), ('', 683), ('', 660), ('', 440), ('', 428), ('', 329), ('', 315), ('', 303), ('', 292), ('', 287)] | |
| AGGLOM_4 | [('', 459), ('', 459), ('', 386), ('', 383), ('', 266), ('', 196), ('', 184), ('', 142), ('', 5), ('', 5)] | |
Reading a sample of requests from each cluster confirmed that the grouping matches distinct, recurring causes of negative reviews.
Coming back to the opening question: yes, with sentence embeddings and clustering it is possible to analyze the whole stream of customer requests and find the sources of negative feedback in a short time and without high labor costs.
We hope our experience proves useful for similar text-analysis tasks.