Prerequisites
需要下载chromedriver.exe
需要用到selenium、jieba、wordcloud、BeautifulSoup(bs4)、lxml、matplotlib、numpy、scipy、xlrd、xlwt、xlutils等模块
都可以使用pip install 模块名 方式安装,如果安装失败,可以自己下载whl文件,并将whl文件
放在python的安装目录Scripts下,再通过pip install 本地地址.whl,安装所需模块
In action
"""Scrape QQ Zone posts ("shuoshuo") and build a word cloud from them."""
import csv
import os
import re
import time
from collections import Counter

import jieba
import matplotlib.pyplot as plt
import xlrd as xlrd
import xlwt as xlwt
from bs4 import BeautifulSoup
from numpy import array  # kept from the original import list
from selenium import webdriver
from selenium.webdriver.common.by import By
from wordcloud import WordCloud, ImageColorGenerator, STOPWORDS
from xlutils.copy import copy

try:
    from scipy.misc import imread
except ImportError:
    # scipy.misc.imread is gone from recent SciPy releases; matplotlib's
    # reader returns the same kind of image array.
    from matplotlib.pyplot import imread


def is_existed(path):
    """Create a fresh, empty workbook at *path*.

    Any file already at *path* is deleted first. The new workbook contains
    a single empty sheet named ``Sheet1``.
    """
    if os.path.exists(path):
        os.remove(path)
    workbook = xlwt.Workbook()
    workbook.add_sheet('Sheet1')
    workbook.save(path)


def write_data(data1, data2, path):
    """Append one row ``(data1, data2)`` to ``Sheet1`` of the workbook at *path*.

    The workbook is opened, copied into a writable form, extended by one
    row after the last used row, and saved back to the same path.
    """
    book = xlrd.open_workbook(path)
    next_row = book.sheet_by_name('Sheet1').nrows
    writable = copy(book)
    sheet = writable.get_sheet(0)
    sheet.write(next_row, 0, data1)
    sheet.write(next_row, 1, data2)
    writable.save(path)


def get_shuoshuo(my_qq, my_pwd, friend_qq, path):
    """Log in to QQ Zone and save a friend's posts to the workbook at *path*.

    Args:
        my_qq: Account number typed into the login form.
        my_pwd: Password typed into the login form.
        friend_qq: Account number whose post page is opened.
        path: Workbook file that is recreated and then filled with one
            row per post (send time, post text).

    Every failure is reported with a printed message and ends the function;
    the browser is always closed before returning.
    """
    is_existed(path)
    driver = webdriver.Chrome()
    try:
        driver.maximize_window()

        try:
            driver.set_page_load_timeout(10)
            driver.get('https://user.qzone.qq.com/{}/311'.format(friend_qq))
            time.sleep(3)
        except Exception:
            print(u'网页启动异常,请重新打开')
            time.sleep(2)
            return

        try:
            driver.find_element(By.ID, 'login_div')
        except Exception:
            print(u'非好友无法进入空间,无权限抓取内容')
            return

        # Switch to the account/password form inside the login frame.
        driver.switch_to.frame('login_frame')
        driver.find_element(By.ID, 'switcher_plogin').click()
        account_box = driver.find_element(By.ID, 'u')
        account_box.clear()
        account_box.send_keys(my_qq)
        password_box = driver.find_element(By.ID, 'p')
        password_box.clear()
        password_box.send_keys(my_pwd)
        driver.find_element(By.ID, 'login_button').click()
        time.sleep(3)
        driver.implicitly_wait(3)

        try:
            driver.find_element(By.ID, 'QM_OwnerInfo_Icon')
        except Exception:
            print(u'空间加载异常,请重新打开')
            time.sleep(2)
            return

        driver.switch_to.frame('app_canvas_frame')
        page = 1
        try:
            while True:
                soup = BeautifulSoup(driver.page_source, 'lxml')
                send_times = soup.select(
                    'ol#msgList li.feed div.box.bgr3 > div.ft div.info a.c_tx.c_tx3.goDetail')
                shuoshuos = soup.select('ol#msgList li.feed div.bd pre.content')
                print(u'正在抓取第%d页的内容>>>>>>>>>>' % page)
                # zip() pairs posts with their timestamps and stops at the
                # shorter list instead of raising on a length mismatch.
                for send_time, shuoshuo in zip(send_times, shuoshuos):
                    write_data(send_time['title'], shuoshuo.text, path)
                # Raises when there is no "next page" link, which is how the
                # loop ends on the last page.
                next_page = driver.find_element(By.LINK_TEXT, u'下一页')
                page = page + 1
                next_page.click()
                time.sleep(3)
                driver.implicitly_wait(3)
        except Exception:
            print(u'抓取到%d页面结束' % page)
    finally:
        driver.quit()


def shuoshuo_analysis(file_path, mask_path='pokemon.jpg',
                      font_path="C:/Windows/Fonts/STFANGSO.ttf",
                      output_path='wordcloud.png'):
    """Segment the posts stored in a CSV file and render a word cloud.

    Args:
        file_path: CSV file whose second column holds the post text.
        mask_path: Image used both as the cloud's mask and as its colour
            source.
        font_path: Font file used to draw the words.
        output_path: Where the finished image is written.

    Prints the segmented words, the distinct words, their counts and the
    word frequencies before drawing.
    """
    # 'utf-8-sig' reads plain UTF-8 and also strips a leading byte-order
    # mark, which would otherwise end up inside the first cell.
    with open(file_path, 'rt', encoding='utf-8-sig', newline='') as file:
        # Skip blank or single-column rows rather than failing on them.
        shuoshuos = [row[1] for row in csv.reader(file) if len(row) > 1]

    phrases = []
    for shuoshuo in shuoshuos:
        phrases += re.split(r'[^\u4E00-\u9FA5\w]+', shuoshuo)
    phrases = [phrase for phrase in phrases if phrase != '']

    words = []
    for phrase in phrases:
        words.extend(jieba.cut(phrase, HMM=True))

    print(words)
    print(len(words))
    print(set(words))
    print(len(set(words)))
    print(Counter(words))

    back_color = imread(mask_path)
    wc = WordCloud(
        background_color='white',
        max_words=1000,
        mask=back_color,
        max_font_size=100,
        # Build a new set: set.add() returns None and would also modify the
        # library's shared STOPWORDS.
        stopwords=STOPWORDS | {'苟利国'},
        font_path=font_path,
        random_state=42,
    )
    wc.generate(' '.join(words))
    image_colors = ImageColorGenerator(back_color)

    plt.imshow(wc)
    plt.axis('off')
    plt.figure()
    plt.imshow(wc.recolor(color_func=image_colors))
    plt.axis('off')
    wc.to_file(output_path)


if __name__ == '__main__':
    shuoshuo_analysis('d:/me.csv')
Something worth noting
1.python版本: 3.6.3
2.生成的csv文件,在shuoshuo_analysis()中直接调用,会报错,至少我这边是这样的
解决方案:
打开生成的csv文件,将其另存为普通的csv文件[不是utf8格式的那种csv]
然后用记事本打开,复制里面的内容;再用sublime打开(应该是乱码的),
将复制的内容覆盖原有的乱码内容,同时save as utf-8
shuoshuo_analysis()调用这个文件
为什么不在最初就保存为utf-8格式的csv文件呢?
直接保存为csv utf-8格式,打开不会乱码,但是在读取时,第一行数据有问题,其他正常(很可能是文件开头的BOM标记造成的;读取时改用encoding='utf-8-sig'应可避免)
3.制作词云图片时,选择的背景图片,最好是对比度比较明显的
Github Source Code