Baidu PaddlePaddle Study — Seven-Day Camp Assignment (5): Final Project
This is the final project for the Baidu camp; the difficulty lies mainly in the small details of the code.
Main tasks
Step 1: crawl the comment data for iQIYI's 《青春有你2》 (reference link: https://www.iqiyi.com/v_19ryfkiv8w.html#curid=15068699100_9f9bab7e0d1e30c494622af777f4ba39)
Crawl the comments under any full-episode video
Collect at least 1,000 comments
Step 2: word-frequency statistics and visualization
Data preprocessing: clean special characters out of the comments (e.g. @#¥%, emoji) and save the cleaned result as a txt file
Chinese word segmentation: add new words (e.g. 青你, 奧利給, 沖鴨) and remove stop words (e.g. 哦, 因此, 不然, 也好, 但是)
Count the top-10 high-frequency words
Visualize the high-frequency words
Step 3: draw a word cloud
Generate the word cloud from the word frequencies
Optional: add a background image and shape the cloud to its outline
Step 4: run the comments through PaddleHub content moderation
What you need to prepare
jieba, for Chinese word segmentation
wordcloud, for drawing the word cloud
a Chinese font for the visualizations
a Chinese stop-word list from any public source online
a custom new-word list built from your own segmentation results (see the sketch after this list)
a background image for the word cloud (optional, not required; a hub matting model can produce one)
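Both word-list files are plain text with one entry per line. A minimal sketch of creating them, assuming the file names add_words.txt and cn_stopwords.txt used in the code below (jieba's user-dictionary format is one word per line, optionally followed by a frequency and a POS tag); the entries are just the examples from the assignment brief:

# Minimal sketch: write the two word-list files the code below expects.
with open('add_words.txt', 'w', encoding='utf-8') as f:
    f.write('青你\n奧利給\n沖鴨\n')          # custom words jieba should keep whole
with open('cn_stopwords.txt', 'w', encoding='utf-8') as f:
    f.write('哦\n因此\n不然\n也好\n但是\n')  # stop words to filter out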
PaddleHub setup
Environment installation
!pip install jieba
!pip install wordcloud -i https://pypi.tuna.tsinghua.edu.cn/simple
# Default font directory on Linux
!ls /usr/share/fonts/
# List the .ttf Chinese fonts visible to the system
!fc-list :lang=zh | grep ".ttf"
# Download a Chinese font (SimHei)
!wget https://mydueros.cdn.bcebos.com/font/simhei.ttf
# Create the font directory .fonts
!mkdir .fonts
# Copy the font file into it
!cp simhei.ttf .fonts/
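Copying the .ttf alone is sometimes not enough, because matplotlib caches its font list and may not pick up the new file. A minimal sketch of registering the font explicitly, assuming matplotlib >= 3.2 and simhei.ttf in the working directory:

import matplotlib
from matplotlib import font_manager

font_manager.fontManager.addfont('simhei.ttf')       # register the downloaded font
matplotlib.rcParams['font.sans-serif'] = ['SimHei']  # default to SimHei for CJK labels
matplotlib.rcParams['axes.unicode_minus'] = False    # keep minus signs rendering correctly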
# Install the moderation model and upgrade PaddleHub
!hub install porn_detection_lstm==1.1.0
!pip install --upgrade paddlehub
Writing the code
from __future__ import print_function
import requests
import json
import re                                   # regular expressions
import time                                 # time utilities
import jieba                                # Chinese word segmentation
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.font_manager as font_manager
from PIL import Image
from wordcloud import WordCloud             # word-cloud rendering
import paddlehub as hub
# Request the iQIYI comment API and return the response body
def getMovieinfo(url):
    '''
    Request the iQIYI comment API.
    Argument url: comment API url
    return: response text, or None on failure
    '''
    session = requests.Session()
    headers = {
        "User-Agent": "Mozilla/5.0",
        "Accept": "application/json",
        "Referer": "http://m.iqiyi.com/v_19rqriflzg.html",
        "Origin": "http://m.iqiyi.com",
        "Host": "sns-comment.iqiyi.com",
        "Connection": "keep-alive",
        "Accept-Language": "en-us,en;q=0.9,zh-CN;q=0.8,zh;q=0.7,zh_TW;q=0.6",
        "Accept-Encoding": "gzip,deflate"
    }
    response = session.get(url, headers=headers)
    if response.status_code == 200:
        return response.text
    return None

# Parse the JSON payload and collect the comments
def saveMovieInfoToFile(lastId, arr):
    '''
    Parse the JSON payload and collect the comments.
    Arguments
        lastId: id of the last comment fetched (pagination cursor)
        arr: list that accumulates comment texts
    return: the new lastId
    '''
    url = "https://sns-comment.iqiyi.com/v3/comment/get_comments.action?agent_type=118&agent_version=9.11.5&business_type=17&content_id=15068699100&page=&page_size=10&types=time&last_id="
    url += str(lastId)
    responseTxt = getMovieinfo(url)
    responseJson = json.loads(responseTxt)
    comments = responseJson['data']['comments']
    for val in comments:
        if 'content' in val.keys():
            arr.append(val['content'])
        lastId = str(val['id'])
    return lastId
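The endpoint is cursor-paginated: each call returns up to page_size comments plus the id of the last one, which goes back in as last_id for the next page. A minimal illustrative sketch of chaining two pages (the full loop lives in the __main__ block below):

arr = []
lastId = '0'                        # '0' asks for the first page
for _ in range(2):                  # two pages of 10 comments each
    lastId = saveMovieInfoToFile(lastId, arr)
    time.sleep(0.5)                 # throttle requests to the API
print(len(arr), 'comments fetched')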
# Strip special characters from the text
def clear_special_char(content):
    '''
    Strip special characters with regular expressions.
    Argument content: raw comment text
    return: cleaned text
    '''
    s = re.sub(r"<(.+?)>| |\t|\r", "", content)          # drop HTML-style tags and stray whitespace
    s = re.sub(r"\n", " ", s)
    s = re.sub(r"\*", "\\*", s)
    s = re.sub(r'[^\u4e00-\u9fa5^a-z^A-Z^0-9]', '', s)   # keep only CJK characters, letters, digits
    s = re.sub(r'[\001\002\003\004\005\006\007\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f'
               r'\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a]+', '', s)  # control characters
    s = re.sub(r'[a-zA-Z]', '', s)                       # drop Latin letters
    s = re.sub(r'^\d+(\.\d+)?$', '', s)                  # drop comments that are pure numbers
    return s
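An illustrative example of what the cleaning does (made-up input):

# '<br/>' is stripped as a tag, '@#¥%' and the emoji fall to the
# character filter, and the Latin letters are removed at the end.
print(clear_special_char('好看<br/>@#¥%abc😄'))   # -> '好看'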
def fenci(text):
    '''
    Segment text with jieba.
    Argument text: sentence or document to segment
    return: list of tokens
    '''
    jieba.load_userdict("add_words.txt")    # register the custom words first
    seg = jieba.lcut(text, cut_all=False)   # precise mode
    return seg
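A quick illustrative check that the user dictionary is in effect (the exact segmentation depends on the dictionary contents):

# With 沖鴨 listed in add_words.txt, jieba keeps it as a single token
print(fenci('大家沖鴨'))   # e.g. ['大家', '沖鴨']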
def stopwordslist(file_path):
    '''
    Build the stop-word list.
    Argument file_path: path of the stop-word file
    return: list of stop words
    '''
    stopwords = [line.strip() for line in open(file_path, encoding='UTF-8').readlines()]
    return stopwords
def movestopwords(sentence, stopwords, counts):
    '''
    Drop stop words and update the word-frequency counts.
    Arguments
        sentence: list of tokens
        stopwords: stop-word list
        counts: dict accumulating word frequencies
    return: None
    '''
    for word in sentence:
        if word not in stopwords and len(word) != 1:   # also skip single-character tokens
            counts[word] = counts.get(word, 0) + 1
    return None
def drawcounts(counts, num):
    '''
    Plot the top-N word frequencies as a bar chart.
    Arguments
        counts: word-frequency dict
        num: how many of the top words to plot
    return: None
    '''
    x_axis = []
    y_axis = []
    c_order = sorted(counts.items(), key=lambda x: x[1], reverse=True)
    for c in c_order[:num]:
        x_axis.append(c[0])
        y_axis.append(c[1])
    matplotlib.rcParams['font.sans-serif'] = ['SimHei']   # Chinese labels need a CJK font
    matplotlib.rcParams['axes.unicode_minus'] = False
    plt.bar(x_axis, y_axis)
    plt.title('詞頻統計結果')
    plt.show()
def drawcloud(word_f):
    '''
    Draw a word cloud from the frequency dict.
    Argument word_f: word-frequency dict
    return: None
    '''
    cloud_mask = np.array(Image.open('china.png'))   # mask image: words fill its dark region
    st = set(["東西", "這是"])                        # extra stop words for the cloud
    wc = WordCloud(background_color='white',
                   mask=cloud_mask,
                   max_words=150,
                   font_path='simhei.ttf',
                   min_font_size=10,
                   max_font_size=100,
                   width=400,
                   relative_scaling=0.3,
                   stopwords=st)
    wc.generate_from_frequencies(word_f)
    wc.to_file('pic.png')
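If no suitable mask image is at hand (china.png is just a local file the author prepared), the cloud can be drawn without one; a minimal sketch under that assumption:

wc = WordCloud(background_color='white',
               max_words=150,
               font_path='simhei.ttf',   # the font downloaded earlier
               width=400,
               height=300)
wc.generate_from_frequencies(counts)     # counts: the frequency dict built in __main__
wc.to_file('pic_nomask.png')             # hypothetical output file name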
def text_detection(test_text, file_path):
    '''
    Run PaddleHub content moderation over the comments.
    Arguments
        test_text: list that receives the lines to be checked
        file_path: path of the comment file
    return: None (flagged comments are printed)
    '''
    porn_detection_lstm = hub.Module(name="porn_detection_lstm")
    f = open(file_path, 'r', encoding='utf-8')
    for line in f:
        if len(line.strip()) == 1:   # skip one-character lines
            continue
        else:
            test_text.append(line)
    f.close()
    input_dict = {"text": test_text}
    # set use_gpu=False on CPU-only machines
    results = porn_detection_lstm.detection(data=input_dict, use_gpu=True, batch_size=1)
    for index, item in enumerate(results):
        if item['porn_detection_key'] == 'porn':
            print(item['text'], ':', item['porn_probs'])
# The comments are paginated, so the iQIYI API has to be called once per page.
# num is the page count; a page holds 10 comments, so crawling 1000 comments
# means num = 100. Some comments contain emoji and other special characters.
if __name__ == "__main__":
    num = 300
    lastId = '0'
    arr = []
    with open('aqy.txt', 'a', encoding='utf-8') as f:
        for i in range(num):
            lastId = saveMovieInfoToFile(lastId, arr)
            time.sleep(0.5)                        # throttle the requests
        for item in arr:
            Item = clear_special_char(item)
            if Item.strip() != '':
                try:
                    f.write(Item + '\n')
                except Exception as e:
                    print(e)                       # e.g. characters the encoding cannot write
    print('共取評論:', len(arr))

    f = open('aqy.txt', 'r', encoding='utf-8')
    counts = {}
    stopwords = stopwordslist('cn_stopwords.txt')  # load the stop words once
    for line in f:
        words = fenci(line)
        movestopwords(words, stopwords, counts)
    f.close()

    drawcounts(counts, 10)                         # bar chart of the top-10 words
    drawcloud(counts)                              # word cloud -> pic.png

    file_path = 'aqy.txt'
    test_text = []
    text_detection(test_text, file_path)           # PaddleHub content moderation

    display(Image.open('pic.png'))                 # show the word cloud (notebook built-in)
版權(quán)聲明:本文內(nèi)容由網(wǎng)絡(luò)用戶投稿,版權(quán)歸原作者所有,本站不擁有其著作權(quán),亦不承擔相應法律責任。如果您發(fā)現(xiàn)本站中有涉嫌抄襲或描述失實的內(nèi)容,請聯(lián)系我們jiasou666@gmail.com 處理,核實后本網(wǎng)站將在24小時內(nèi)刪除侵權(quán)內(nèi)容。
版權(quán)聲明:本文內(nèi)容由網(wǎng)絡(luò)用戶投稿,版權(quán)歸原作者所有,本站不擁有其著作權(quán),亦不承擔相應法律責任。如果您發(fā)現(xiàn)本站中有涉嫌抄襲或描述失實的內(nèi)容,請聯(lián)系我們jiasou666@gmail.com 處理,核實后本網(wǎng)站將在24小時內(nèi)刪除侵權(quán)內(nèi)容。