Something needed before action
需要下载chromedriver.exe
需要用到selenium、jieba、wordcloud、beautifulsoup4(bs4)、lxml、xlrd、xlwt、xlutils、matplotlib、numpy、scipy等模块
以上模块都可以使用 pip install 模块名 的方式安装;如果安装失败,可以自行下载对应的whl文件,
将其放在Python安装目录的Scripts下,再通过 pip install 本地路径.whl 安装所需模块
In action
"""Scrape QQ Zone status posts ("shuoshuo") and build a word cloud from them."""
import csv
import os
import re
import time
from collections import Counter

import jieba
import xlrd
import xlwt
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
from numpy import array  # no longer used below; kept so existing importers are unaffected
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By
from wordcloud import WordCloud, ImageColorGenerator, STOPWORDS
from xlutils.copy import copy

try:
    from scipy.misc import imread
except ImportError:
    # scipy.misc.imread was removed in SciPy 1.2; matplotlib's reader is a
    # drop-in replacement for loading the mask image as an array.
    from matplotlib.pyplot import imread

_SHUOSHUO_URL = 'https://user.qzone.qq.com/{}/311'
_TIME_SELECTOR = ('ol#msgList li.feed div.box.bgr3 > div.ft div.info '
                  'a.c_tx.c_tx3.goDetail')
_TEXT_SELECTOR = 'ol#msgList li.feed div.bd pre.content'


def is_existed(path):
    """Reset the workbook at *path* to an empty one with a single 'Sheet1'.

    Despite the name, nothing is returned: an existing file is removed and a
    fresh workbook is saved in its place.
    """
    if os.path.exists(path):
        os.remove(path)
    workbook = xlwt.Workbook()
    workbook.add_sheet('Sheet1')
    workbook.save(path)


def write_data(data1, data2, path):
    """Append one row ``(data1, data2)`` to the workbook stored at *path*.

    The workbook is re-read and re-saved on every call, so the file on disk
    is always complete even if scraping is interrupted.
    """
    book = xlrd.open_workbook(path)
    next_row = book.sheet_by_name('Sheet1').nrows
    writable = copy(book)
    sheet = writable.get_sheet(0)
    sheet.write(next_row, 0, data1)
    sheet.write(next_row, 1, data2)
    writable.save(path)


def _login(driver, my_qq, my_pwd):
    """Fill in and submit the account/password form inside the login frame."""
    driver.switch_to.frame('login_frame')
    driver.find_element(By.ID, 'switcher_plogin').click()
    account_box = driver.find_element(By.ID, 'u')
    account_box.clear()
    account_box.send_keys(my_qq)
    password_box = driver.find_element(By.ID, 'p')
    password_box.clear()
    password_box.send_keys(my_pwd)
    driver.find_element(By.ID, 'login_button').click()
    time.sleep(3)


def _scrape_pages(driver, path):
    """Walk the paginated feed, appending every post on each page to *path*."""
    page = 1
    while True:
        soup = BeautifulSoup(driver.page_source, 'lxml')
        send_times = soup.select(_TIME_SELECTOR)
        posts = soup.select(_TEXT_SELECTOR)
        print('正在抓取第%d页的内容>>>>>>>>>>' % page)
        # zip() pairs each post with its timestamp and cannot run past the
        # shorter list if the two selectors match a different number of nodes.
        for sent, post in zip(send_times, posts):
            write_data(sent.get('title', ''), post.text, path)
        try:
            # A missing "next page" link is the normal end-of-feed signal.
            next_link = driver.find_element(By.LINK_TEXT, '下一页')
            page += 1
            next_link.click()
        except WebDriverException:
            print('抓取到%d页面结束' % page)
            return
        time.sleep(3)
        driver.implicitly_wait(3)


def get_shuoshuo(my_qq, my_pwd, friend_qq, path):
    """Log in to QQ Zone and save a friend's posts into the workbook *path*.

    Args:
        my_qq: Account number typed into the login form.
        my_pwd: Password typed into the login form.
        friend_qq: QQ number whose posts page is opened.
        path: Workbook file that is reset and then filled with one
            ``(send time, text)`` row per post.

    Returns:
        None. Progress and failures are reported with ``print``.

    The browser is always closed on exit, including when a step fails.
    """
    is_existed(path)
    driver = webdriver.Chrome()
    try:
        driver.maximize_window()
        try:
            driver.set_page_load_timeout(10)
            driver.get(_SHUOSHUO_URL.format(friend_qq))
            time.sleep(3)
        except WebDriverException:
            print('网页启动异常,请重新打开')
            time.sleep(2)
            return

        try:
            driver.find_element(By.ID, 'login_div')
        except WebDriverException:
            print('非好友无法进入空间,无权限抓取内容')
            return
        _login(driver, my_qq, my_pwd)

        driver.implicitly_wait(3)
        try:
            driver.find_element(By.ID, 'QM_OwnerInfo_Icon')
        except WebDriverException:
            print('空间加载异常,请重新打开')
            time.sleep(2)
            return

        driver.switch_to.frame('app_canvas_frame')
        _scrape_pages(driver, path)
    finally:
        # Single exit point for the browser: no step can touch a driver that
        # has already been quit, and no failure path leaks the process.
        driver.quit()


def shuoshuo_analysis(file_path):
    """Segment the posts stored in a CSV file and render a word cloud.

    Args:
        file_path: CSV file whose second column holds the post text.

    Prints the segmented words with some counts, then writes the recoloured
    cloud to ``wordcloud.png``. The mask image ``pokemon.jpg`` is read from
    the current working directory.
    """
    # 'utf-8-sig' drops a leading BOM (written by Excel's "CSV UTF-8" export)
    # that would otherwise corrupt the first row; BOM-less files read the same.
    with open(file_path, 'rt', encoding='utf-8-sig', newline='') as file:
        posts = [row[1] for row in csv.reader(file) if len(row) > 1]

    splitter = re.compile(r'[^\u4E00-\u9FA5\w]+')
    words = []
    for post in posts:
        for phrase in splitter.split(post):
            if phrase:
                words.extend(jieba.cut(phrase, HMM=True))
    unique_words = set(words)
    print(words)
    print(len(words))
    print(unique_words)
    print(len(unique_words))
    print(Counter(words))

    back_color = imread('pokemon.jpg')
    # set.add() returns None, so build the stop-word set first and pass the
    # set itself; copying also leaves the library's global STOPWORDS untouched.
    stopwords = set(STOPWORDS)
    stopwords.add('苟利国')
    wc = WordCloud(
        background_color='white',
        max_words=1000,
        mask=back_color,
        max_font_size=100,
        stopwords=stopwords,
        font_path="C:/Windows/Fonts/STFANGSO.ttf",
        random_state=42,
    )
    wc.generate(' '.join(words))
    image_colors = ImageColorGenerator(back_color)
    plt.imshow(wc)
    plt.axis('off')
    plt.figure()
    plt.imshow(wc.recolor(color_func=image_colors))
    plt.axis('off')
    wc.to_file('wordcloud.png')


if __name__ == '__main__':
    shuoshuo_analysis('d:/me.csv')
Something worth noting
1.python版本: 3.6.3
2.生成的csv文件,在shuoshuo_analysis()中直接调用,会报错,至少我这边是这样的
解决方案:
打开生成的csv文件,将其另存为普通的csv文件[不是那个utf8格式的csv]
然后用记事本打开,复制里面的内容;再用sublime打开(应该是乱码的),
将复制的内容覆盖原有的乱码内容,同时save as utf-8
shuoshuo_analysis()调用这个文件
为什么不在最初就保存为utf-8格式的csv文件呢?
直接保存为csv utf-8格式,打开不会乱码,但是在读取时,第一行数据有问题,其他正常(原因很可能是这种格式会在文件开头写入BOM标记;读取时使用 encoding='utf-8-sig' 可以跳过BOM)
3.制作词云图片时,选择的背景图片,最好是对比度比较明显的
Github Source Code