1.微博登录
2.爬取数据
3.mysql存储
4.pyecharts本地展示
5.用vue搭建网站web展示
先放图:
1.微博登录
新浪微博的登录不是简单的post就能解决的,它的登录参数经过了加密,所以我们要按照同样的加密算法处理用户名和密码,才能正常登录微博,得到我们想要的数据。
先不要慌,第一步当然是import我们需要的库
from urllib import request,parseimport http.cookiejarimport base64import jsonimport rsaimport binasciifrom PIL import Imagefrom bs4 import BeautifulSoupimport pymysql
要登录,那肯定要先拿到验证码
def GetCode(self):
    """Download the login captcha image, save it as ``vv.png`` and display it.

    The image is fetched through ``self.opener`` and written to ``vv.png`` in
    the current working directory, then opened with PIL and shown so the user
    can read the code.
    """
    # Captcha URL.
    # NOTE(review): the ``r`` and ``p`` query values are fixed literals here;
    # confirm whether ``p`` should come from the pre-login response instead.
    url = "https://login.sina.com.cn/cgi/pin.php?r=694905&s=0&p=gz-52086a8a846fadcdacf4fb058324aa387858"
    img = self.opener.open(url)
    # ``with`` guarantees the file is closed even if reading the response fails.
    with open('vv.png', 'wb') as f:
        f.write(img.read())
    im = Image.open('vv.png')
    im.show()
拿到了验证码还不够,登录之前还有个预登录,拿到我们后面加密需要的参数
def prelogin(self):
    """Call the SSO pre-login endpoint and return its payload as a dict.

    The endpoint answers with JSONP (``callback({...})``). The text between
    the first ``(`` and the *last* ``)`` is parsed as JSON, so a ``)`` that
    appears inside a JSON string value no longer truncates the payload.

    Returns:
        dict: the decoded JSON object (see the sample response below).

    Raises:
        ValueError: if the response body does not contain valid JSON
            (``json.JSONDecodeError`` is a subclass of ``ValueError``).
    """
    # NOTE(review): ``su`` and ``_`` in this URL are fixed literals rather than
    # values derived from the account being logged in / the current time.
    url = "https://login.sina.com.cn/sso/prelogin.php?entry=weibo&callback=sinaSSOController.preloginCallBack&su=NDc5MTkyNzQyJTQwcXEuY29t&rsakt=mod&checkpin=1&client=ssologin.js(v1.4.19)&_=1529471491358"
    a = self.opener.open(url).read().decode('utf-8')
    # Strip the JSONP wrapper: keep what lies between the first "(" and the last ")".
    a = a[a.find("(") + 1:a.rfind(")")]
    b = json.loads(a)
    return b
    # Sample response:
    # sinaSSOController.preloginCallBack({ "retcode":0,"servertime":1540617565,"pcid":"gz-65c55b3534f8a1df1330b4708fb6d1b57273","nonce":"ONED4A","pubkey":"EB2A38568661887FA180BDDB5CABD5F21C7BFD59C090CB2D245A87AC253062882729293E5506350508E7F9AA3BB77F4333231490F915F6D63C55FE2F08A49B353F444AD3993CACC02DB784ABBB8E42A9B1BBFFFB38BE18D78E87A0E41B9B8F73A928EE0CCEE1F6739884B9777E4FE9E88A1BBE495927AC4A799B3181D6442443","rsakv":"1330428213","is_openlock":0,"showpin":0,"exectime":10})
然后用这些参数对登录参数进行加密,说实在话具体的加密细节我也不记得了,当时做的时候拿了个记事本把所有东西记下来然后分析,也查了很多博客的资料才做好。
def GetMixUser(self, username, password):
    """Build the encoded user name and RSA-encrypted password for the login form.

    Reads ``pubkey``, ``servertime`` and ``nonce`` from ``self.pre``.

    Args:
        username: account name in plain text.
        password: account password in plain text.

    Returns:
        dict: ``{'uname': <base64 text>, 'upass': <hex text of the ciphertext>}``.
    """
    pre = self.pre

    # Password: RSA-encrypt "<servertime>\t<nonce>\n<password>" with the
    # server-supplied modulus (hex) and public exponent 65537, then hex-encode.
    public_key = rsa.PublicKey(int(pre['pubkey'], 16), 65537)
    plaintext = "".join((str(pre['servertime']), '\t', pre['nonce'], '\n', password))
    ciphertext = rsa.encrypt(plaintext.encode('utf-8'), public_key)
    hex_password = binascii.hexlify(ciphertext).decode('utf-8')

    # User name: base64 of the urlencoded pair "username=<quoted name>".
    # The 9-byte prefix "username=" maps to exactly 12 base64 characters, so
    # dropping the first 12 leaves the base64 of the quoted name alone.
    query = parse.urlencode({'username': username})
    encoded_query = base64.b64encode(query.encode('utf-8')).decode('utf-8')
    encoded_name = encoded_query[12:]

    return {'uname': encoded_name, 'upass': hex_password}
拿到加密后的登录参数,可以提交了
def login(self, username, password, code):
    """Submit the encrypted credentials and follow the two login redirects.

    Args:
        username: account name in plain text.
        password: account password in plain text.
        code: captcha text typed by the user (sent as ``door``).

    Returns:
        bool: True when both redirects were followed; False when a redirect
        marker is missing from a response or the second page contains "身份".
    """
    mix = self.GetMixUser(username, password)
    uname = mix['uname']
    upass = mix['upass']
    url = "https://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.19)"
    print("登录中……")
    postData = {
        "door": code,
        "encoding": "utf-8",
        "entry": "weibo",
        "from": "null",
        "gateway": 1,
        "nonce": self.pre['nonce'],
        "prelt": 72,
        "pwencode": "rsa2",
        "qrcode_flag": False,
        "returntype": "META",
        "savestate": 7,
        "servertime": self.pre['servertime'],
        "service": "miniblog",
        "rsakv": self.pre['rsakv'],
        "su": uname,
        "sp": upass,
        "url": "https://weibo.com/ajaxlogin.php?framelogin=1&callback=parent.sinaSSOController.feedBackUrlCallBack",
        "useticket": 1,
        "vsnf": 1,
    }
    postData = parse.urlencode(postData).encode('utf-8')
    result = self.opener.open(url, postData).read().decode('gbk')

    # First redirect: the URL sits between 'replace("' and '")'.
    # The closing ")" is searched *after* the marker, so an earlier ")" in
    # the page (e.g. another script call) cannot cut the URL short.
    start = result.find("replace")
    if start == -1:
        return False
    start += 9  # len('replace("')
    end = result.find(')', start)
    if end == -1:
        return False
    url1 = result[start:end - 1]  # end - 1 drops the closing quote

    result = self.opener.open(url1).read().decode("gbk")
    if result.find("身份") != -1:
        return False

    # Second redirect: the URL sits between "location.replace('" and "')".
    start = result.find('location')
    if start == -1:
        return False
    start += 18  # len("location.replace('")
    end = result.find(')', start)
    if end == -1:
        return False
    url2 = result[start:end - 1]

    # The body of the final page is not used; the request is made for its
    # effect on the opener (presumably session cookies - confirm).
    self.opener.open(url2).read()
    return True
2.爬取信息
先得到用户follow的列表
def GetUserList(self, uid, pageNum):
    """Fetch one page of the accounts followed by user ``uid``.

    Args:
        uid: id of the user whose follow list is requested.
        pageNum: page number of the follow list.

    Returns:
        dict | None: ``{"name": [...], "uid": [...]}`` with two parallel
        lists, or None when the page could not be fetched or parsed (callers
        rely on None to stop paging).
    """
    url = "https://weibo.com/" + str(uid) + "/follow?page=" + str(pageNum)
    try:
        result = self.opener.open(url).read().decode('utf-8')
        # Drop the escaped \n, \t, \r sequences and then every remaining backslash.
        html = result.replace('\\n', '').replace('\\t', '').replace('\\r', '').replace('\\', '')
        # NOTE(review): the original narrowed ``html`` to the span between two
        # marker strings, but both markers were lost from this listing (they
        # read as the same single space, which always yields an empty string).
        # The whole document is parsed instead; restore the markers if the
        # narrowing matters.
        soup = BeautifulSoup(html, "html.parser")
        cards = soup.find_all(name='div', attrs={"class": "info_name W_fb W_f14"})
        names = []
        uids = []
        for card in cards:
            try:
                link = card.find(name="a")
                # presumably "id=<10-digit uid>..." - TODO confirm the attribute format
                followed_uid = link['usercard'][3:13]
            except (AttributeError, KeyError, TypeError):
                # No <a> inside the card, or no ``usercard`` attribute on it.
                print("No Data")
                continue
            uids.append(followed_uid)
            names.append(card.text)
            print("加入用户:" + card.text)
        return {"name": names, "uid": uids}
    except Exception as exc:
        # Best effort: report the failure and signal it with None.
        print("GetUserList failed for " + url + ": " + repr(exc))
        return None
再拿到这些用户的主页微博言论,我们得到他们发的所有博文
def GetTalks(self,uid): rlist = [] i=0 html="" while(True): try: result=self.opener.open("https://weibo.com/u/"+str(uid)+"?page="+str(i)).read().decode("utf-8") html = result.replace("\\t", "").replace("\\n", "").replace("\\r", "").replace("\\", "") html = html[html.find("
3.数据存储
我暂时只做了这些,然后存储到数据库吧。mysql的基础语句很简单,我也就会这么几句,不会的可以百度一下。
def sqllogin(self):
    """Open and return a connection to the local ``weibouser`` MySQL database."""
    # NOTE(review): credentials are hard-coded; consider reading them from
    # configuration or the environment.
    db = pymysql.connect(host='localhost', user='root', db='weibouser', passwd='root', charset='utf8mb4')
    return db


def sqlProcess(self, db):
    """Crawl every user whose TAG is 1 and store their follows and posts.

    For each pending user: mark it in progress, fetch its posts, read the
    first three pages of its follow list, insert newly seen users (one CLASS
    deeper) and follow edges, insert the posts, then mark the user done.
    Stops when no row with TAG = 1 is left.

    All statements are parameterized because names and posts are scraped
    text, and each step is committed so the work survives a later crash.

    Args:
        db: an open database connection (see ``sqllogin``).
    """
    cursor = db.cursor()
    while True:
        # TAG: 1 = unprocessed, 2 = in progress, 3 = finished.
        cursor.execute("SELECT * FROM USERS WHERE TAG =1 LIMIT 1")
        result = cursor.fetchone()
        if not result:
            break

        # Columns read by position: 1 = name, 2 = user id, 4 = class (depth).
        user_name = result[1]
        user_id = result[2]

        cursor.execute("UPDATE USERS SET TAG=2 WHERE USERID=%s", (user_id,))
        db.commit()

        talks = self.GetTalks(uid=user_id)

        for page in range(1, 4):
            userlist = self.GetUserList(uid=user_id, pageNum=page)
            try:
                uids = userlist['uid']
                names = userlist['name']
            except (TypeError, KeyError):
                # GetUserList signals failure with None: stop paging.
                break
            if int(result[4]) != 3:  # users at depth 3 are not expanded further
                for followed_name, followed_uid in zip(names, uids):
                    try:
                        # ``== False`` kept on purpose: a None result must not count as "absent".
                        if self.IfExist(db, "users", "name", followed_name) == False:
                            cursor.execute(
                                "INSERT INTO USERS (NAME,USERID,TAG,CLASS) VALUES (%s,%s,%s,%s)",
                                (followed_name, followed_uid, 1, int(result[4]) + 1),
                            )  # store the followed user
                        # NOTE(review): the flattened original does not show whether
                        # this insert was inside the ``if`` above; the edge is
                        # recorded for every followed user here - confirm.
                        cursor.execute(
                            "INSERT INTO FOLLOWS (USERID,FUID,FUNAME) VALUES (%s,%s,%s)",
                            (user_id, followed_uid, followed_name),
                        )
                    except Exception as exc:
                        print("Error:", exc)
            db.commit()

        # NOTE(review): assumed to run once per user, after the paging loop - confirm.
        for talk in talks or []:
            try:
                cursor.execute(
                    "INSERT INTO USERTALKS (USERID,NAME,TALK)VALUES (%s,%s,%s)",
                    (user_id, user_name, talk),
                )  # store the post
            except Exception as exc:
                print("Error:", exc)

        cursor.execute("UPDATE USERS SET TAG=3 WHERE USERID=%s", (user_id,))
        db.commit()


def AnotherProcess(self, db):
    """Fetch each stored user's profile page and save sex, keywords and description.

    A user whose page cannot be fetched, or whose page lacks the ``keywords``
    or ``description`` meta tag, is skipped instead of being stored with data
    left over from the previous user.

    Args:
        db: an open database connection (see ``sqllogin``).
    """
    cursor = db.cursor()
    cursor.execute("SELECT * FROM USERS WHERE 1")
    results = cursor.fetchall()
    for result in results:
        try:
            r = self.opener.open("https://weibo.com/u/" + result[2]).read().decode("utf-8")
        except Exception as exc:
            print("Error:", exc)
            continue
        html = r.replace("\\t", "").replace("\\n", "").replace("\\r", "").replace("\\", "")
        # The page is treated as female when it contains "female", male otherwise.
        sex = "女" if "female" in html else "男"
        soup = BeautifulSoup(html, "html.parser")
        keywords_tag = soup.find(attrs={"name": "keywords"})
        description_tag = soup.find(attrs={"name": "description"})
        try:
            keywords = keywords_tag['content']
            description = description_tag['content']
        except (TypeError, KeyError):
            # Meta tag missing, or present without a ``content`` attribute.
            print("No Data")
            continue
        cursor.execute(
            "INSERT INTO USERDETAILS (NAME,DESCRIPTION,KEYWORDS,SEX)VALUES(%s,%s,%s,%s)",
            (result[1], description, keywords, sex),
        )
        db.commit()
4.pyecharts本地展示
import pymysql
from pyecharts import Graph


class DrawCharts:
    """Build a follow-relationship graph from the database and render it with pyecharts.

    Instantiating the class loads nodes and links from MySQL and renders the
    chart immediately.
    """

    # Declared on the class as in the original; ``__init__`` rebinds fresh
    # lists per instance so two instances do not accumulate into the same list.
    raw_nodes = []
    links = []

    def __init__(self):
        self.raw_nodes = []
        self.links = []
        self.CreatRelation()
        self.DrawCharts()

    def CreatRelation(self):
        """Fill ``self.links`` from FOLLOWS and ``self.raw_nodes`` from USERS."""
        db = pymysql.connect(host='localhost', user='root', db='weibouser', passwd='root', charset='utf8mb4')
        try:
            cursor = db.cursor()
            cursor.execute("SELECT * FROM USERS WHERE 1")
            users = cursor.fetchall()
            for user in users:  # original note here read "Shanghai University"
                cursor.execute("SELECT * FROM FOLLOWS WHERE USERID=%s", (user[2],))
                for result in cursor.fetchall():
                    # NOTE(review): column 4 of FOLLOWS is used as the followed
                    # user's name - verify against the table schema.
                    # ``self.links``: the bare name ``links`` is not in scope
                    # inside a method and raised NameError.
                    self.links.append({"source": user[1], "target": result[4]})
                    print(user[1] + "->" + result[4])
            for i in range(3):
                cursor.execute("SELECT * FROM USERS WHERE CLASS=%s", (i + 1,))
                for result in cursor.fetchall():
                    # Deeper classes get smaller symbols: 30, 17, 4.
                    self.raw_nodes.append({"name": result[1], "symbolSize": 30 - i * 13, "category": i})
        finally:
            # Close the connection even when a query fails.
            db.close()

    def DrawCharts(self):
        """Render the collected nodes and links as a pyecharts ``Graph``."""
        graph = Graph("微博关注关系图", width=1200, height=600)
        # One category per crawl depth; they differ only in name and colour.
        categories = [
            {
                "name": label,
                "itemStyle": {
                    "normal": {
                        "color": color,
                        "borderColor": "#5182ab",
                        "borderWidth": 1.8,
                        "show": "True",
                    }
                },
            }
            for label, color in (
                ("一级深度", '#CC0033'),
                ("二级深度", '#f44242'),
                ("三级深度", '#663366'),
            )
        ]
        graph.add("", self.raw_nodes, self.links, label_pos="right", graph_repulsion=10,
                  is_legend_show=False, categories=categories,
                  label_text_color=None, is_label_show=True)
        graph.render()
效果就不发了,运行起来比较卡,因为节点太多了,当时老师说怎么怎么加速,也不说细节,我当时一脸懵逼,也就没管了,大不了减少节点数量嘛。
5.web展示
前端两段关键代码。。
{{ Content_Title }}
{{ Search_Card_Title }}
{{ item.message }}
{{ Follow_Card_Title }}
{{ Words_Card_Title }}
后端关键代码
//用户的具体信息domain package com.example.demo;import org.hibernate.engine.loading.internal.CollectionLoadContext;import javax.persistence.*;@Entity@Table(name = "userdetails")public class UserDetails { @Id @GeneratedValue(strategy = GenerationType.AUTO) private int num; @Column(name="description") public String description; @Column(name="keywords") public String keywords; @Column(name="sex") public String sex; @Column(name="name") public String name; void setName(String name){ this.name=name; } void setDescription(String description){ this.description=description; } void setKeywords(String keywords){ this.keywords=keywords; } void setSex(String sex){ this.sex=sex; } String getName(){ return this.name; } String getDescription(){ return this.description; } String getKeywords(){ return this.keywords; } String getSex(){ return this.sex; }}
// NOTE(review): this listing is flattened onto a single line, so the leading "//" makes the whole
// line a comment as far as a Java compiler is concerned. It also appears to be damaged: generic
// type arguments are missing (e.g. "Listuserlist", "new ArrayList ()") and two loops break off at
// "for(int i=0;i", apparently swallowing the rest of GetLinks and the headers of the handlers that
// build the RetnNodes and RetnWords lists. Restore it from the original project before compiling.
// Still visible: a Spring @Controller with @CrossOrigin, @RequestMapping handlers for "users",
// "links", "follows" and "details", and autowired UserRepositroy, FollowsRepositroy and
// DetailsRepositroy fields.
//The most important controller package com.example.demo;import org.springframework.stereotype.Controller;import org.springframework.web.bind.annotation.CrossOrigin;import org.springframework.web.bind.annotation.RequestMapping;import org.springframework.web.bind.annotation.ResponseBody;import org.springframework.beans.factory.annotation.Autowired;import java.util.ArrayList;import java.util.List;@Controller@CrossOriginpublic class IndexController { @Autowired private UserRepositroy userRepositroy; @RequestMapping(value="users") @ResponseBody public Object GetUser(){ Listuserlist=userRepositroy.findAll(); return userlist; } @RequestMapping(value="links") @ResponseBody public Object GetLinks(String name){ List followsList =followsRepositroy.findByName(name); List links =new ArrayList (); for(int i=0;i followsList=followsRepositroy.findByName(name); List nodes =new ArrayList (); RetnNodes r=new RetnNodes(); r.setName(name); r.setCategory(1); r.setSymbolSize(30); nodes.add(r); for(int i=0;i userlist=userRepositroy.findByName(name); String word = userlist.get(0).getKeywords(); String value=userlist.get(0).getKeyvalue(); String[] array_word= word.split(","); String[] array_value= value.split(","); List words=new ArrayList (); for(int i=0;i<50;i++){ RetnWords w=new RetnWords(); w.setName(array_word[i]); w.setValue(Double.parseDouble(array_value[i])); words.add(w); } return words; } @Autowired private FollowsRepositroy followsRepositroy; @RequestMapping(value="follows") @ResponseBody public Object GetFollows(String name){ List followslist=followsRepositroy.findByName(name); return followslist; } @Autowired private DetailsRepositroy detailsRepositroy; @RequestMapping(value="details") @ResponseBody public Object GetDetails(String name){ UserDetails userDetails=detailsRepositroy.findOneByName(name); return userDetails; }}
其他的都是类似的。无限重复改名。