博客
关于我
强烈建议你试试无所不能的chatGPT,快点击我
[web开发] Vue + spring boot + echart 微博爬虫展示平台
阅读量:4954 次
发布时间:2019-06-12

本文共 13679 字,大约阅读时间需要 45 分钟。

1.微博登录

2.爬取数据

3.mysql存储

4.pyecharts本地展示

5.用vue搭建网站web展示

 

先放图:

1.微博登录

新浪微博的登录不是简单的post就能解决的,它的登录参数是经过加密的,所以我们要按照同样的算法(RSA)对用户名和密码进行加密之后再提交,才能正常登录微博,得到我们想要的数据。

先不要慌,第一步当然是import我们需要的库

from urllib import request,parseimport http.cookiejarimport base64import jsonimport rsaimport binasciifrom PIL import Imagefrom bs4 import BeautifulSoupimport pymysql

要登录,那肯定要先拿到验证码

def GetCode(self, url=None, path='vv.png'):
    """Download the login captcha image, save it to disk and display it.

    Args:
        url: Captcha endpoint. Defaults to the address hard-coded in the
            original snippet.
            NOTE(review): the ``p=`` query value looks like a ``pcid`` as
            returned by ``prelogin`` -- confirm whether it has to match the
            current pre-login session.
        path: File the image bytes are written to before being opened.

    Returns:
        None. The image is shown with the platform's default viewer.
    """
    if url is None:
        # Captcha address
        url = "https://login.sina.com.cn/cgi/pin.php?r=694905&s=0&p=gz-52086a8a846fadcdacf4fb058324aa387858"
    # presumably self.opener is a urllib OpenerDirector, whose responses
    # support close() -- verify where the opener is built.
    response = self.opener.open(url)
    try:
        data = response.read()
    finally:
        response.close()
    # Context managers guarantee the file handles are released even if
    # writing or displaying the image fails.
    with open(path, 'wb') as f:
        f.write(data)
    with Image.open(path) as im:
        im.show()

拿到了验证码还不够,登录之前还有个预登录,用来拿到我们后面加密需要的参数(servertime、nonce、pubkey、rsakv,返回内容的示例见代码末尾)。

def prelogin(self, su="NDc5MTkyNzQyJTQwcXEuY29t"):
    """Call the SSO pre-login endpoint and return its JSON payload as a dict.

    The endpoint answers with JSONP, i.e. ``callbackName({...})``; the text
    between the first ``(`` and the last ``)`` is parsed as JSON.

    Args:
        su: Value sent as the ``su`` query parameter. The default is the
            value that was hard-coded in the original URL.
            NOTE(review): presumably the base64 of the URL-encoded user
            name, as produced by ``GetMixUser`` -- confirm.

    Returns:
        dict: The decoded payload (see the sample response below).

    Raises:
        ValueError: If the response is not wrapped in parentheses or the
            wrapped text is not valid JSON (``json.JSONDecodeError`` is a
            ``ValueError`` subclass).
    """
    url = (
        "https://login.sina.com.cn/sso/prelogin.php?entry=weibo"
        "&callback=sinaSSOController.preloginCallBack"
        "&su=" + parse.quote(su, safe='') +
        "&rsakt=mod&checkpin=1&client=ssologin.js(v1.4.19)&_=1529471491358"
    )
    body = self.opener.open(url).read().decode('utf-8')
    start = body.find("(")
    # Use the LAST ")" so a parenthesis inside a JSON string value cannot
    # truncate the payload.
    end = body.rfind(")")
    if start == -1 or end <= start:
        raise ValueError("unexpected prelogin response: %r" % body[:200])
    return json.loads(body[start + 1:end])


# Sample response:
# sinaSSOController.preloginCallBack({
# "retcode":0,"servertime":1540617565,"pcid":"gz-65c55b3534f8a1df1330b4708fb6d1b57273","nonce":"ONED4A","pubkey":"EB2A38568661887FA180BDDB5CABD5F21C7BFD59C090CB2D245A87AC253062882729293E5506350508E7F9AA3BB77F4333231490F915F6D63C55FE2F08A49B353F444AD3993CACC02DB784ABBB8E42A9B1BBFFFB38BE18D78E87A0E41B9B8F73A928EE0CCEE1F6739884B9777E4FE9E88A1BBE495927AC4A799B3181D6442443","rsakv":"1330428213","is_openlock":0,"showpin":0,"exectime":10})

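然后用这些参数对登录参数进行加密。大致流程是:密码部分把 servertime、nonce 和明文密码按 "servertime\tnonce\n密码" 的格式拼接,用 pubkey(十六进制的模数,公钥指数为 65537)做 RSA 加密,再转成十六进制字符串;用户名部分先做 URL 编码,再做 base64 编码。说实在话具体的分析细节我也不记得了,当时做的时候拿了个记事本把所有东西记下来然后分析,也查了很多博客的资料才做好。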

def GetMixUser(self, username, password):
    """Build the encoded user name and RSA-encrypted password for login.

    Reads ``pubkey``, ``servertime`` and ``nonce`` from ``self.pre``.

    Args:
        username: Account name in clear text.
        password: Password in clear text.

    Returns:
        dict: ``{'uname': <base64 text>, 'upass': <hex text>}``.
    """
    prelogin_info = self.pre

    # The public key arrives as a hexadecimal modulus; the exponent is 65537.
    modulus = int(prelogin_info['pubkey'], 16)
    public_key = rsa.PublicKey(modulus, 65537)

    # Plaintext layout: "<servertime>\t<nonce>\n<password>".
    plaintext = str(prelogin_info['servertime']) + '\t' + prelogin_info['nonce'] + '\n' + password
    cipher_hex = binascii.hexlify(rsa.encrypt(plaintext.encode('utf-8'), public_key))

    # urlencode yields "username=<quoted name>". The 9-byte prefix
    # "username=" base64-encodes to exactly 12 characters, so dropping the
    # first 12 leaves the base64 of the URL-quoted user name alone.
    query = parse.urlencode({'username': username})
    encoded_query = base64.b64encode(query.encode('utf-8')).decode('utf-8')

    return {'uname': encoded_query[12:], 'upass': cipher_hex.decode('utf-8')}

拿到加密后的登录参数,可以提交了

def login(self, username, password, code):
    """Submit the encrypted credentials and follow the two login redirects.

    Steps, as performed by the code below:
      1. POST the login form to the SSO endpoint.
      2. Open the URL found in the ``location.replace(...)`` call of the
         response.
      3. If that page contains "身份", give up; otherwise open the URL in
         its own ``location.replace(...)`` call.

    Args:
        username: Account name in clear text (encoded via ``GetMixUser``).
        password: Password in clear text (encrypted via ``GetMixUser``).
        code: Captcha text, sent as the ``door`` field.

    Returns:
        bool: True when both redirects were found and opened. False when
        the second page contains "身份" or when an expected redirect is
        missing from a response (previously a garbage URL was opened in
        that case).
    """
    import re  # local import: the file's import line does not provide it

    # Matches location.replace("...") / location.replace('...') and
    # captures the URL, instead of slicing at fixed character offsets.
    redirect_re = re.compile(r"""location\.replace\(\s*(['"])(.+?)\1\s*\)""")

    def decode(raw):
        # The pages are decoded as GBK, as in the original; undecodable
        # bytes are replaced so a stray byte cannot abort the login.
        return raw.decode('gbk', errors='replace')

    mix = self.GetMixUser(username, password)
    url = "https://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.19)"
    print("登录中……")
    form = {
        "door": code,
        "encoding": "utf-8",
        "entry": "weibo",
        "from": "null",
        "gateway": 1,
        "nonce": self.pre['nonce'],
        "prelt": 72,
        "pwencode": "rsa2",
        "qrcode_flag": False,
        "returntype": "META",
        "savestate": 7,
        "servertime": self.pre['servertime'],
        "service": "miniblog",
        "rsakv": self.pre['rsakv'],
        "su": mix['uname'],
        "sp": mix['upass'],
        "url": "https://weibo.com/ajaxlogin.php?framelogin=1&callback=parent.sinaSSOController.feedBackUrlCallBack",
        "useticket": 1,
        "vsnf": 1,
    }
    post_data = parse.urlencode(form).encode('utf-8')

    page = decode(self.opener.open(url, post_data).read())
    first = redirect_re.search(page)
    if first is None:
        return False

    page = decode(self.opener.open(first.group(2)).read())
    if "身份" in page:
        return False
    second = redirect_re.search(page)
    if second is None:
        return False

    # The final page is only requested for its side effects on the opener;
    # its content is not used.
    self.opener.open(second.group(2)).read()
    return True

2.爬取信息

先得到用户follow的列表

def GetUserList(self,uid,pageNum):        url="https://weibo.com/"+str(uid)+"/follow?page="+str(pageNum)        try:            result=self.opener.open(url).read().decode('utf-8')            html = result.replace('\\n', '').replace('\\t', '').replace('\\r', '').replace('\\', '')            html = html[html.find("
"):html.find("
")] soup = BeautifulSoup(html, "html.parser") list_a = soup.findAll(name='div', attrs={
"class": "info_name W_fb W_f14"}) name = [] uid = [] for a in list_a: try: b = a.find(name="a") b = b['usercard'] b = b[3:13:] uid.append(b) name.append(a.text) print("加入用户:" + a.text) except: print("No Data") dic = {
"name": name, "uid": uid} return dic except: pass

再拿到这些用户的主页微博言论,我们得到他们发的所有博文

def GetTalks(self,uid):        rlist = []        i=0        html=""        while(True):            try:                result=self.opener.open("https://weibo.com/u/"+str(uid)+"?page="+str(i)).read().decode("utf-8")                html = result.replace("\\t", "").replace("\\n", "").replace("\\r", "").replace("\\", "")                html = html[html.find("

3.数据存储

我暂时只做了这些,然后存储到数据库吧。mysql的基础语句很简单,我也就会这么几句,不会的百度吧。。

def sqllogin(self):
    """Open and return a connection to the local ``weibouser`` database.

    NOTE(review): host and credentials are hard-coded; consider reading
    them from configuration.
    """
    db = pymysql.connect(host='localhost', user='root', db='weibouser', passwd='root', charset='utf8mb4')
    return db


def sqlProcess(self, db):
    """Crawl every user whose TAG is 1 and store follows and posts.

    TAG values: 1 = not processed, 2 = being processed, 3 = done.

    For each pending user the posts are fetched with ``GetTalks``. Unless
    the user's CLASS (column 4) is already 3, up to three pages of followed
    users are fetched with ``GetUserList``; each followed user for which
    ``IfExist`` reports False is inserted into USERS (TAG 1, CLASS + 1) and
    FOLLOWS. The posts then go into USERTALKS and the user is marked done.

    All statements are parameterized, because names and posts are scraped
    text that may contain quotes. The work for each user is committed once
    that user is finished.

    Args:
        db: Open database connection, as returned by ``sqllogin``.
    """
    cursor = db.cursor()
    try:
        while True:
            cursor.execute("SELECT * FROM USERS WHERE TAG =1")
            row = cursor.fetchone()
            if not row:
                break
            # Column positions follow the original code:
            # 1 = NAME, 2 = USERID, 4 = CLASS.
            name, uid, depth = row[1], row[2], int(row[4])

            cursor.execute("UPDATE USERS SET TAG=2 WHERE USERID=%s", (uid,))
            talks = self.GetTalks(uid=uid) or []

            # Follow lists were only ever used when CLASS is not 3, so they
            # are no longer downloaded for the others.
            if depth != 3:
                for page in range(1, 4):
                    userlist = self.GetUserList(uid=uid, pageNum=page)
                    try:
                        uids = userlist['uid']
                        names = userlist['name']
                    except (KeyError, TypeError):
                        # No usable list (e.g. None): stop paging.
                        break
                    for followed_name, followed_uid in zip(names, uids):
                        try:
                            # "== False" is kept on purpose: IfExist is
                            # defined elsewhere and its return type is not
                            # visible here.
                            if self.IfExist(db, "users", "name", followed_name) == False:
                                cursor.execute(
                                    "INSERT INTO USERS (NAME,USERID,TAG,CLASS) VALUES (%s,%s,%s,%s)",
                                    (followed_name, followed_uid, 1, depth + 1),
                                )
                                cursor.execute(
                                    "INSERT INTO FOLLOWS (USERID,FUID,FUNAME) VALUES (%s,%s,%s)",
                                    (uid, followed_uid, followed_name),
                                )
                        except Exception as exc:
                            # Best effort: one bad row must not stop the crawl.
                            print("Error", exc)

            for talk in talks:
                try:
                    cursor.execute(
                        "INSERT INTO USERTALKS (USERID,NAME,TALK) VALUES (%s,%s,%s)",
                        (uid, name, talk),
                    )
                except Exception as exc:
                    print("Error", exc)

            cursor.execute("UPDATE USERS SET TAG=3 WHERE USERID=%s", (uid,))
            # PyMySQL does not autocommit by default; without this the
            # changes are discarded when the connection closes.
            db.commit()
    finally:
        cursor.close()


def AnotherProcess(self, db):
    """Fetch every user's profile page and store description, keywords, sex.

    A user whose page cannot be downloaded or decoded is skipped (the
    original went on with a stale or undefined page). A missing
    ``keywords`` / ``description`` meta tag is stored as an empty string.

    Args:
        db: Open database connection, as returned by ``sqllogin``.
    """
    cursor = db.cursor()
    try:
        cursor.execute("SELECT * FROM USERS WHERE 1")
        for row in cursor.fetchall():
            try:
                raw = self.opener.open("https://weibo.com/u/" + str(row[2])).read().decode("utf-8")
            except Exception as exc:
                print("Error", exc)
                continue
            html = raw.replace("\\t", "").replace("\\n", "").replace("\\r", "").replace("\\", "")
            # The page is treated as a female profile when it contains the
            # text "female", otherwise as male.
            sex = "女" if "female" in html else "男"

            soup = BeautifulSoup(html, "html.parser")

            def meta_content(meta_name):
                tag = soup.find(attrs={"name": meta_name})
                return tag.get('content', '') if tag is not None else ''

            keywords = meta_content("keywords")
            description = meta_content("description")
            cursor.execute(
                "INSERT INTO USERDETAILS (NAME,DESCRIPTION,KEYWORDS,SEX) VALUES (%s,%s,%s,%s)",
                (row[1], description, keywords, sex),
            )
        db.commit()
    finally:
        cursor.close()

4.pyecharts本地展示

import pymysql
from pyecharts import Graph


class DrawCharts:
    """Build and render the follow-relationship graph from the database.

    Instantiating the class loads nodes and links from the ``weibouser``
    database and then renders the chart with pyecharts.
    """

    # Kept at class level so ``DrawCharts.raw_nodes`` / ``DrawCharts.links``
    # still resolve; every instance rebinds its own lists in __init__ so
    # data is never shared between instances.
    raw_nodes = []
    links = []

    def __init__(self):
        self.raw_nodes = []
        self.links = []
        self.CreatRelation()
        self.DrawCharts()

    def CreatRelation(self):
        """Fill ``self.links`` and ``self.raw_nodes`` from USERS and FOLLOWS.

        One link is added per FOLLOWS row of each user, and one node per
        user with CLASS 1..3. Node size shrinks with CLASS (30, 17, 4) and
        the category index is CLASS - 1.
        """
        db = pymysql.connect(host='localhost', user='root', db='weibouser', passwd='root', charset='utf8mb4')
        try:
            cursor = db.cursor()
            cursor.execute("SELECT * FROM USERS WHERE 1")
            for user in cursor.fetchall():
                # Column positions follow the original code:
                # USERS[1] = name, USERS[2] = user id, FOLLOWS[4] = link target.
                # NOTE(review): verify index 4 against the FOLLOWS schema.
                cursor.execute("SELECT * FROM FOLLOWS WHERE USERID=%s", (user[2],))
                for follow in cursor.fetchall():
                    self.links.append({"source": user[1], "target": follow[4]})
                    print(user[1] + "->" + follow[4])
            for depth in range(3):
                cursor.execute("SELECT * FROM USERS WHERE CLASS=%s", (depth + 1,))
                for row in cursor.fetchall():
                    self.raw_nodes.append(
                        {"name": row[1], "symbolSize": 30 - depth * 13, "category": depth}
                    )
        finally:
            # Close the connection even when a query fails.
            db.close()

    def DrawCharts(self):
        """Render the collected nodes and links with ``graph.render()``."""
        graph = Graph("微博关注关系图", width=1200, height=600)
        # One legend category per crawl depth; only name and color differ.
        categories = [
            {
                "name": label,
                "itemStyle": {
                    "normal": {
                        "color": color,
                        "borderColor": "#5182ab",
                        "borderWidth": 1.8,
                        "show": "True",
                    }
                },
            }
            for label, color in (
                ("一级深度", '#CC0033'),
                ("二级深度", '#f44242'),
                ("三级深度", '#663366'),
            )
        ]
        graph.add("", self.raw_nodes, self.links, label_pos="right", graph_repulsion=10,
                  is_legend_show=False, categories=categories,
                  label_text_color=None, is_label_show=True)
        graph.render()

效果就不发了,运行起来比较卡,因为节点太多了,当时老师说怎么怎么加速,也不说细节,我当时一脸懵逼,也就没管了,大不了减少节点数量嘛。

5.web展示

前端两段关键代码。。

 

后端关键代码

//用户的具体信息domain package com.example.demo;import org.hibernate.engine.loading.internal.CollectionLoadContext;import javax.persistence.*;@Entity@Table(name = "userdetails")public class UserDetails {    @Id    @GeneratedValue(strategy = GenerationType.AUTO)    private int num;    @Column(name="description")    public String description;    @Column(name="keywords")    public String keywords;    @Column(name="sex")    public String sex;    @Column(name="name")    public String name;    void setName(String name){        this.name=name;    }    void setDescription(String description){        this.description=description;    }    void setKeywords(String keywords){        this.keywords=keywords;    }    void setSex(String sex){        this.sex=sex;    }    String getName(){        return this.name;    }    String getDescription(){        return this.description;    }    String getKeywords(){        return this.keywords;    }    String getSex(){        return this.sex;    }}
//最重要的controller package com.example.demo;import org.springframework.stereotype.Controller;import org.springframework.web.bind.annotation.CrossOrigin;import org.springframework.web.bind.annotation.RequestMapping;import org.springframework.web.bind.annotation.ResponseBody;import org.springframework.beans.factory.annotation.Autowired;import java.util.ArrayList;import java.util.List;@Controller@CrossOriginpublic class IndexController {    @Autowired    private UserRepositroy userRepositroy;    @RequestMapping(value="users")    @ResponseBody    public Object GetUser(){       List
userlist=userRepositroy.findAll(); return userlist; } @RequestMapping(value="links") @ResponseBody public Object GetLinks(String name){ List
followsList =followsRepositroy.findByName(name); List
links =new ArrayList
(); for(int i=0;i
followsList=followsRepositroy.findByName(name); List
nodes =new ArrayList
(); RetnNodes r=new RetnNodes(); r.setName(name); r.setCategory(1); r.setSymbolSize(30); nodes.add(r); for(int i=0;i
userlist=userRepositroy.findByName(name); String word = userlist.get(0).getKeywords(); String value=userlist.get(0).getKeyvalue(); String[] array_word= word.split(","); String[] array_value= value.split(","); List
words=new ArrayList
(); for(int i=0;i<50;i++){ RetnWords w=new RetnWords(); w.setName(array_word[i]); w.setValue(Double.parseDouble(array_value[i])); words.add(w); } return words; } @Autowired private FollowsRepositroy followsRepositroy; @RequestMapping(value="follows") @ResponseBody public Object GetFollows(String name){ List
followslist=followsRepositroy.findByName(name); return followslist; } @Autowired private DetailsRepositroy detailsRepositroy; @RequestMapping(value="details") @ResponseBody public Object GetDetails(String name){ UserDetails userDetails=detailsRepositroy.findOneByName(name); return userDetails; }}

其他的都是类似的。无限重复改名。

 

转载于:https://www.cnblogs.com/aoru45/p/9860946.html

你可能感兴趣的文章
丶制作一个数字猜猜看小游戏
查看>>
秋季学期学习总结
查看>>
SpringBoot 优化内嵌的Tomcat
查看>>
phpstudy2018搭建网站,访问目录出现Forbidden You don't have permission to access / on this server...
查看>>
Visual Studio 2017再现C语言经典例题(一)
查看>>
HDU4045-第二类斯特林数
查看>>
Dagger2 入门解析
查看>>
JS——indexOf replace search
查看>>
关于android studio安装过程中的问题
查看>>
mysql 函数学习(常用) 及 用户管理
查看>>
sigmod函数求导
查看>>
Linux学习笔记--基础命令
查看>>
PHP+MySQL+Zend+phhMyAdmin教程
查看>>
记Tomcat进程stop卡住问题定位处理
查看>>
c++ 链接mysql:error LNK2019: 无法解析的外部符号
查看>>
js-面试题整理
查看>>
thinkphp命名空间
查看>>
数组课堂作业
查看>>
【POJ 1026】Cipher(置换群)
查看>>
职场有影帝出没,屌丝们请当心!
查看>>