JavaWeb阶段练习:360影视数据采集
一、idea创建Maven管理的JavaWeb项目
二、创建一个用来封装影片数据的JavaBean对象
/**
 * Mutable JavaBean holding the data of a single film entry.
 *
 * <p>Every property is a plain {@code String} except the paid flag. The class
 * performs no validation; any field may be {@code null}.
 */
public class FilmInfo {

    /** Film title. */
    private String name;
    /** Link to the film's page. */
    private String url;
    /** Poster image address. */
    private String poster;
    /** Whether the film is flagged as paid content. */
    private boolean isPaid;
    /** Release year, kept as text. */
    private String year;
    /** Rating, kept as text. */
    private String score;
    /** Cast / starring text. */
    private String star;

    /** Creates an empty instance: all text fields {@code null}, paid flag {@code false}. */
    public FilmInfo() {
    }

    /**
     * Creates a fully populated instance.
     *
     * @param name   film title
     * @param url    link to the film's page
     * @param poster poster image address
     * @param isPaid whether the film is flagged as paid content
     * @param year   release year text
     * @param score  rating text
     * @param star   cast / starring text
     */
    public FilmInfo(String name, String url, String poster, boolean isPaid, String year, String score, String star) {
        this.name = name;
        this.url = url;
        this.poster = poster;
        this.isPaid = isPaid;
        this.year = year;
        this.score = score;
        this.star = star;
    }

    // ---- name ----

    public String getName() {
        return this.name;
    }

    public void setName(String name) {
        this.name = name;
    }

    // ---- url ----

    public String getUrl() {
        return this.url;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    // ---- poster ----

    public String getPoster() {
        return this.poster;
    }

    public void setPoster(String poster) {
        this.poster = poster;
    }

    // ---- paid flag ----

    public boolean isPaid() {
        return this.isPaid;
    }

    public void setPaid(boolean paid) {
        this.isPaid = paid;
    }

    // ---- year ----

    public String getYear() {
        return this.year;
    }

    public void setYear(String year) {
        this.year = year;
    }

    // ---- score ----

    public String getScore() {
        return this.score;
    }

    public void setScore(String score) {
        this.score = score;
    }

    // ---- star ----

    public String getStar() {
        return this.star;
    }

    public void setStar(String star) {
        this.star = star;
    }

    /**
     * Renders all fields in the form {@code FilmInfo{name='..', ...}}.
     * name, url, poster and star are wrapped in single quotes; isPaid, year
     * and score are printed bare. {@code null} values appear as the text "null".
     */
    @Override
    public String toString() {
        return "FilmInfo{"
                + "name='" + name + '\''
                + ", url='" + url + '\''
                + ", poster='" + poster + '\''
                + ", isPaid=" + isPaid
                + ", year=" + year
                + ", score=" + score
                + ", star='" + star + '\''
                + '}';
    }
}
三、使用了 Gson 和 HttpUtils 的依赖
1. Gson的maven依赖
<dependency>
<groupId>com.google.code.gson</groupId>
<artifactId>gson</artifactId>
<version>2.8.5</version>
</dependency>
2. HttpUtils是我手写的Http工具类,里面包含4个方法:简单获取页面源码的getHtmlCode、发送get请求的Get、发送post请求的Post,以及采集数据的getData
/**
 * Small HTTP helper built on {@link HttpURLConnection}, plus the scraper that
 * converts a film listing page into JSON.
 *
 * <p>All methods are static. I/O failures are not propagated: the stack trace is
 * printed and whatever text was read so far (possibly empty) is returned.
 */
public class HttpUtils {

    /** Charset used to encode request bodies and decode every response. */
    private static final String CHARSET = "UTF-8";

    /** Browser-like User-Agent sent by {@link #Get(String)} and {@link #Post(String, String)}. */
    private static final String USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36";

    /**
     * Matches one film {@code <li class="item">} entry of the listing page.
     * Capture groups: 1 = href, 2 = monitor attribute, 3 = poster src,
     * 5 = text of the optional "pay" span, 6 = text of the "hint" span,
     * 8 = text of the optional "point" span, 9 = title, 10 = star text.
     * Compiled once because the expression never changes.
     */
    private static final Pattern FILM_ITEM = Pattern.compile(
            "<li class=\"item\">\\s*"
            + "<a class=\"js-tongjic\" href=\"(.*?)\" monitor-shortpv-c=\"(.*?)\">\\s*"
            + "<div class=\"cover g-playicon\">\\s*"
            + "<img src=\"(.*?)\">\\s*"
            + "(<span class=\"pay\">(.*?)</span>)?\\s*"
            + "<div class=\"mask-wrap\">\\s*"
            + "<span class=\"hint\">(.*?)</span>\\s*"
            + "(<span class=\"point\">(.*?)</span>)?\\s*"
            + "</div>\\s*</div>\\s*"
            + "<div class=\"detail\">\\s*"
            + "<p class=\"title g-clear\">\\s*"
            + "<span class=\"s1\">(.*?)</span>\\s*"
            + "</p>\\s*"
            + "<p class=\"star\">(.*?)</p>\\s*"
            + "</div>\\s*</a>\\s*</li>");

    /** Shared pretty-printing serializer used by {@link #getData(String)}. */
    private static final Gson PRETTY_GSON = new GsonBuilder().setPrettyPrinting().create();

    /**
     * Fetches a URL with default request headers and returns the body as text.
     *
     * <p>The response code is not checked. Connect/read timeouts (3 s / 5 s, the
     * same values {@link #Get(String)} uses) are set so a stalled server cannot
     * block the caller indefinitely.
     *
     * @param url address to fetch
     * @return the response body with each line terminated by {@code "\n"}; empty or
     *         partial if an {@link IOException} occurred
     */
    public static String getHtmlCode(String url) {
        StringBuilder body = new StringBuilder();
        HttpURLConnection connection = null;
        try {
            connection = (HttpURLConnection) new URL(url).openConnection();
            connection.setConnectTimeout(3000);
            connection.setReadTimeout(5000);
            appendBody(connection, body);
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            release(connection);
        }
        return body.toString();
    }

    /**
     * Sends a GET request with browser-like headers.
     *
     * @param url address to request
     * @return the response body (lines joined with {@code "\n"}) when the status is 200;
     *         the text {@code "ResponseCode is Error:" + status} for any other status;
     *         empty or partial text if an {@link IOException} occurred
     */
    public static String Get(String url) {
        StringBuilder body = new StringBuilder();
        HttpURLConnection conn = null;
        try {
            conn = (HttpURLConnection) new URL(url).openConnection();
            conn.setRequestMethod("GET");
            // 3 s to establish the connection, 5 s for each read.
            conn.setConnectTimeout(3000);
            conn.setReadTimeout(5000);
            conn.setRequestProperty("Content-Type", "text/html;charset=utf-8");
            conn.setRequestProperty("Accept", "text/html,application/xhtml+xml,application/xml,application/json;");
            conn.setRequestProperty("User-Agent", USER_AGENT);
            conn.connect();
            int status = conn.getResponseCode();
            if (status != 200) {
                return "ResponseCode is Error:" + status;
            }
            appendBody(conn, body);
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Runs on the early return above as well, so the connection is always released.
            release(conn);
        }
        return body.toString();
    }

    /**
     * Sends a form-encoded POST request with browser-like headers.
     *
     * <p>The URL and the request content are echoed to standard output before sending.
     *
     * @param url            address to request
     * @param requestContent request body, sent UTF-8 encoded as
     *                       {@code application/x-www-form-urlencoded}; must not be {@code null}
     * @return the response body (lines joined with {@code "\n"}) when the status is 200;
     *         the text {@code "ResponseCode is Error:" + status} for any other status;
     *         empty or partial text if an {@link IOException} occurred
     */
    public static String Post(String url, String requestContent) {
        System.out.println("请求的URL:"+url);
        System.out.println("请求的参数:"+requestContent);
        StringBuilder body = new StringBuilder();
        HttpURLConnection conn = null;
        try {
            conn = (HttpURLConnection) new URL(url).openConnection();
            conn.setRequestProperty("Connection","keep-alive");
            conn.setRequestMethod("POST");
            conn.setDoInput(true);
            conn.setDoOutput(true);
            conn.setConnectTimeout(3000);
            conn.setReadTimeout(15000);
            conn.setRequestProperty("Content-Type", "application/x-www-form-urlencoded;charset=utf-8");
            conn.setRequestProperty("Accept", "text/html,application/xhtml+xml,application/xml,application/json,application/x-www-form-urlencoded");
            conn.setRequestProperty("User-Agent", USER_AGENT);
            // Close the request stream once the body is written instead of leaving it open.
            try (OutputStream out = conn.getOutputStream()) {
                out.write(requestContent.getBytes(CHARSET));
                out.flush();
            }
            int status = conn.getResponseCode();
            if (status != 200) {
                return "ResponseCode is Error:" + status;
            }
            appendBody(conn, body);
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            release(conn);
        }
        return body.toString();
    }

    /**
     * Downloads a listing page with {@link #Get(String)}, extracts every film entry
     * matched by the item pattern and serializes the result as pretty-printed JSON.
     *
     * <p>Each film's link is built as {@code "http://www.360kan.com/"} followed by the
     * captured href. The paid flag is true only when the optional "pay" span contains
     * exactly {@code "付费"}. When nothing matches (including when the download failed)
     * the result is an empty JSON array.
     *
     * @param url listing page address
     * @return JSON array of {@code FilmInfo} objects
     */
    public static String getData(String url) {
        Matcher m = FILM_ITEM.matcher(Get(url));
        ArrayList<FilmInfo> list = new ArrayList<FilmInfo>();
        while (m.find()) {
            list.add(new FilmInfo(
                    m.group(9),
                    "http://www.360kan.com/" + m.group(1),
                    m.group(3),
                    "付费".equals(m.group(5)),
                    m.group(6),
                    m.group(8),
                    m.group(10)));
        }
        return PRETTY_GSON.toJson(list);
    }

    /**
     * Reads the connection's response body line by line into {@code out}, appending
     * {@code "\n"} after each line. The reader is always closed. Text read before a
     * failure stays in {@code out} so callers can still return it.
     */
    private static void appendBody(HttpURLConnection conn, StringBuilder out) throws IOException {
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(conn.getInputStream(), CHARSET))) {
            String line;
            while ((line = reader.readLine()) != null) {
                out.append(line).append("\n");
            }
        }
    }

    /** Disconnects the connection if one was opened. */
    private static void release(HttpURLConnection conn) {
        if (conn != null) {
            conn.disconnect();
        }
    }
}
Java原生实现get和post请求的方法,我在之前的博客中已经介绍过,不熟悉的朋友可以回看,这里不再赘述。下面只解释一下getData方法:它先使用get请求获取指定页面的源码,再通过正则表达式提取我们所需要的数据,最后使用gson序列化为Json字符串返回,没有什么技术含量
注意:为了取到详细信息,此处的正则表达式比较长。建议把这类较长的正则表达式放到配置文件中读取,避免代码语句过长、问题排查困难;我这里只是写博客做记录,所以直接放在了变量里
四、随便写个前端页面
1. JSP部分
<%@ page import="com.zhiyuan.utils.HttpUtils" %>
<%@ page import="com.google.gson.JsonArray" %>
<%@ page import="com.google.gson.JsonParser" %>
<%@ page import="com.zhiyuan.bean.FilmInfo" %>
<%@ page import="java.util.ArrayList" %>
<%@ page import="com.google.gson.Gson" %>
<%@ page import="com.google.gson.JsonElement" %>
<%@ page contentType="text/html;charset=UTF-8" language="java" pageEncoding="UTF-8" %>
<%--
  Film listing page: fetches one listing page through HttpUtils.getData, turns the
  returned JSON array back into FilmInfo objects and renders one <li> per film.
  All film text comes from a third-party page, so it is HTML-escaped before output.
--%>
<%!
    /**
     * Escapes the five HTML-significant characters so scraped text cannot inject
     * markup or break out of an attribute value. Returns "" for null.
     */
    private static String escapeHtml(String text) {
        if (text == null) {
            return "";
        }
        StringBuilder out = new StringBuilder(text.length() + 16);
        for (int i = 0; i < text.length(); i++) {
            char c = text.charAt(i);
            switch (c) {
                case '&':
                    out.append("&amp;");
                    break;
                case '<':
                    out.append("&lt;");
                    break;
                case '>':
                    out.append("&gt;");
                    break;
                case '"':
                    out.append("&quot;");
                    break;
                case '\'':
                    out.append("&#39;");
                    break;
                default:
                    out.append(c);
            }
        }
        return out.toString();
    }
%>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>360Video</title>
    <link rel="icon" href="https://www.360kan.com/favicon.ico">
    <%-- The random query string forces the browser to re-download the stylesheet on every request. --%>
    <link rel="stylesheet" href="./css/style.css?v=<%=Math.random()%>">
    <script src="./js/jquery-3.1.1.js"></script>
</head>
<body>
<div class="header">
    <div class="nav">360影视采集</div>
</div>
<div class="container">
    <ul class="list">
        <%
            String data = HttpUtils.getData("https://www.360kan.com/dianying/list.php?rank=rankhot&cat=all&area=all&act=all&year=all&pageno=2&from=dianying_list");
            // getData always returns a JSON array, so it can be deserialized in one call.
            FilmInfo[] filmInfos = new Gson().fromJson(data, FilmInfo[].class);
            for (FilmInfo filmInfo : filmInfos) {
                // Keep the star line short: at most 12 characters followed by "...".
                String star = filmInfo.getStar() == null ? "" : filmInfo.getStar();
                if (star.length() > 12) {
                    star = star.substring(0, 12) + "...";
                }
        %>
        <li>
            <div class="poster">
                <img src="<%=escapeHtml(filmInfo.getPoster())%>">
                <div class="paid"><%=filmInfo.isPaid()?"付费":"免费"%></div>
                <div class="info">
                    <div class="year"><%=escapeHtml(filmInfo.getYear())%></div>
                    <div class="score"><%=escapeHtml(filmInfo.getScore())%></div>
                </div>
            </div>
            <div class="detail">
                <div class="name"><%=escapeHtml(filmInfo.getName())%></div>
                <div class="star"><%=escapeHtml(star)%></div>
            </div>
        </li>
        <%
            }
        %>
    </ul>
</div>
</body>
</html>
2. css部分
/* Stylesheet for the film listing page: page header, centered container and a wrapping grid of film cards. */

/* Global reset: no default margin/padding, border-box sizing everywhere. */
*{
margin: 0;
padding: 0;
box-sizing: border-box;
}
/* Light grey page background. */
body,html{
background-color:#f2f2f2;
}
/* Links without underline. */
a{
text-decoration:none;
}
/* List items without bullets. */
li{
list-style:none;
}
/* Full-width white header bar with centered green bold text and a drop shadow. */
.header{
width: 100%;
height: 50px;
line-height: 50px;
text-align: center;
color:#228b22;
font-weight: bold;
background-color:#fff;
box-shadow:rgba(0, 0, 0, 0.2) 1px 1px 8px 1px;
}
/* Fixed-width white content area, horizontally centered. */
.container{
width:1190px;
background-color:#fff;
margin:20px auto;
}
/* One film card: fixed 155x250 column, children stacked and centered. */
.list > li{
display: flex;
flex-direction:column;
align-items:center;
width: 155px;
height: 250px;
margin-bottom:15px;
}
/* Card grid: cards flow in rows, wrap, and spread across the row. */
.list{
display:flex;
flex-direction: row;
flex-wrap: wrap;
justify-content:space-between;
}
/* Poster frame: rounded, clips its content, and is the positioning context for the .info and .paid overlays. */
li > .poster{
width: 155px;
height: 212px;
border-radius:5px;
overflow: hidden;
position:relative;
}
/* Poster image stretched to fill the frame. */
.poster > img{
width: 100%;
height: 100%;
}
/* Translucent strip pinned to the bottom of the poster; its children are pushed to opposite ends. */
.poster .info{
position:absolute;
width:155px;
height: 38px;
line-height: 38px;
font-size: 12px;
color:#000;
bottom:0;
background-color:rgba(0,0,0,.12);
display: flex;
flex-direction: row;
justify-content:space-between;
padding:0 10px;
}
/* Small rounded badge pinned to the top-left corner of the poster. */
.poster .paid{
position: absolute;
left:5px;
top:5px;
width:40px;
height: 20px;
line-height: 20px;
font-size: 12px;
color:#fff;
background-color:goldenrod;
text-align:center;
border-radius:5px;
}
/* Score text color inside the info strip. */
.info .score{
color:goldenrod;
}
/* Year text color inside the info strip. */
.info .year{
color:#fff;
}
/* Text area under the poster: name and star lines stacked and centered. */
li > .detail{
width: 155px;
height: 38px;
display: flex;
flex-direction: column;
justify-content: center;
align-items:center;
cursor:pointer;
}
/* Film name line. */
.detail > .name{
width:155px;
height: 14px;
line-height: 14px;
font-size: 14px;
color:#000;
padding-left:5px;
}
/* Star line: smaller grey text below the name. */
.detail > .star{
width: 155px;
height: 12px;
line-height: 12px;
font-size: 12px;
color:#666666;
margin-top:5px;
padding-left:5px;
}
五、运行起来看一下效果
这里我只采集了1页数据(示例URL中的pageno=2)。采集其他页的数据不过是重复同样的操作,没有什么技术含量,就不再赘述了
原文作者:絷缘
作者邮箱:zhiyuanworkemail@163.com
原文地址:https://zhiyuandnc.github.io/360Video/
版权声明:本文为博主原创文章,转载请注明原文链接作者信息