Something needed before action
需要用到 lxml 和 BeautifulSoup，两者都可以通过 pip 安装（pip install lxml beautifulsoup4）。
In action
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48
|
""" 抓取网易云音乐 """ import urllib.request
from bs4 import BeautifulSoup
def get_html(url, headers, timeout=10):
    """Fetch *url* and return the response body as text.

    Args:
        url: Address to request.
        headers: Mapping of request header names to values.
        timeout: Seconds to wait on blocking network operations before
            giving up. ``None`` disables the limit.

    Returns:
        The response body decoded to ``str``.

    Raises:
        urllib.error.URLError: If the request cannot be completed.
        UnicodeDecodeError: If the body is not valid in the chosen charset.
        LookupError: If the response declares a charset Python does not know.
    """
    req = urllib.request.Request(url, headers=headers)
    # Without a timeout a stalled connection would block the script forever.
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        # Prefer the charset declared in the Content-Type header; UTF-8 is
        # only the fallback when the response does not declare one.
        charset = resp.headers.get_content_charset() or 'utf-8'
        return resp.read().decode(charset)
def parse_html(html):
    """Print the playlists found in a playlist-listing page.

    Every ``li`` under ``ul#m-pl-container`` is treated as one playlist. For
    each one the cover image URL, name, link, play count, creator name and
    creator home page are printed, followed by a blank line.

    Args:
        html: Page markup as text.
    """
    host = 'https://music.163.com'
    soup = BeautifulSoup(html, 'lxml')
    # Query inside each <li> instead of building four page-wide lists and
    # indexing them in lockstep: if one entry lacks an element, the lists
    # would have different lengths and either raise IndexError or pair up
    # fields that belong to different playlists.
    for item in soup.select('ul#m-pl-container li'):
        cover = item.select_one('div img')
        name = item.select_one('div a.msk')
        views = item.select_one('div.bottom span.nb')
        creator = item.select_one('p > span + a')
        # Skip entries that are missing any of the expected elements.
        if any(tag is None for tag in (cover, name, views, creator)):
            continue
        print('歌单封面: ', cover['src'])
        print('歌单名称: ', name['title'])
        print('歌单链接: ', host + name['href'])
        print('歌单播放量: ', views.text)
        print('歌单创建者: ', creator['title'])
        print('创建者主页: ', host + creator['href'], '\n')
if __name__ == '__main__':
    # Script entry point: download the playlist listing page and print
    # what parse_html extracts from it.
    spider_url = 'https://music.163.com/discover/playlist'
    request_headers = {
        'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)',
        'Host': 'music.163.com',
    }
    result = get_html(spider_url, headers=request_headers)
    parse_html(result)
|
Something worth noting
1. Python 版本：3.6.3
2. 可以结合前一篇，做个歌词分析
Github Source Code