实时新闻采集器
天亮爬虫篇--中级篇
今日头条早些年完全是爬虫,现在是厂家来爬它
maven
WebSpiderMiddle4Job001();
<!-- 仓库源配置 -->
nexus aliyun
<!-- 依赖包 -->
<dependencies>
<!-- 项目打包与发布 -->
com.tianliangedu.job001.ui
UIManager();
com.tianliangedu.job001.schedule
TaskScheduleManager(); //负责任务调度
com.tianliangedu.job001.download
com.tianliangedu.job001.parser
com.tianliangedu.job001.persistence
com.tianliangedu.job001.utils
StaticValue(); //存放项目当中的静态变量
ReadConfigUtil(); //读取配置文件的工具类,既支持直接读取classpath下的,也支持读取外置配置文件
IOUtil();
WebPageDownloadUtil(); //用于下载给定任意网址后对应的html代码
WebCharsetDetectorUtil(); //拿charset
RegexUtil();
com.tianliangedu.job001.pojos 放不持久化类
UrlTaskPojo();
com.tianliangedu.job001.pojos.entity 与数据库一一对应起来
com.tianliangedu.job001.controller
seeds.txt 不放在resources里面
resources
//git化管理
192.168.1.14
在gitlab,->your projects挑一个点进去-> groups ->挑一个->new project
->project name:WebSpiderMiddle4Job001
Project description (optional):
将源码xxx与gitlab项目进行正规的版本化操作
复制http
//git init
eclipse->右键项目->Team->Share Project->Git->
选中Use or create ... 取消里面的项目选中状态
点击Create Repository,来创建本地版本库->Finish
右键项目->Team->Commit...
勾选pom.xml
上方message写上:项目环境搭建与包初始化,项目的第一次提交
commit
右键项目->Team->Remote->Push
Source ref: xxx/master. (在本地默认是master)
Destination ref: xxx/dev_v1(存放的位置) Add. ->next->finish
自定义编辑author 注释方法
种子文件读取工具类:
public class UIManager{
public UrlTaskPojo getRootUrlByDirect(){
return UrlTaskPojo(StaticValue.rootTitle,StataicValue.rootUrl);
}
public UrlTaskPojo getRootUrlByStaticValue(){
return new UrlTaskPojo(StaticValue.rootTitle,StataicValue.rootUrl);
}
public List<UrlTaskPojo> getRootUrlBySeedFile(String dataFilePath,boolean isClassPath){
//这里改为封装类,里面至少包含title,url
List<String> lineList = IOUtil.readFileToList(dataFilePath,isClassPath,StaticValue.defaultEncoding);
List<UrlTaskPojo> resultTaskPojo = xx;
for(String line:lineList){
line = line.trim();
if(line.length() > 0 && !line.startsWith("#")){
String[] columnArray = line.split("\\s");
if(columnArray.length == 2){
UrlTaskPojo tempPojo = new UrlTaskPojo(columnArray[0].trim(),columnArray[1].trim());
resultTaskPojo.add(tempPojo);
}else{
sout.err();
throw new Exception("");
}
}
}
return resultTaskPojo;
}
public static void addSeedurlsToTaskSchedule(){
xxx
}
}
/**
 * Static settings shared across the project.
 */
public class StaticValue {
    /** Root URL used for the root task (placeholder value in the notes). */
    public static String rootUrl = "http://xxx";

    /** Default encoding used when reading files. */
    public static String defaultEncoding = "utf-8";

    /** Title paired with {@link #rootUrl} for the root task. */
    public static String rootTitle = "";

    /**
     * Separator constant: a single newline.
     * NOTE(review): the name looks like a typo for sep_next_line; kept unchanged so existing
     * references still resolve.
     */
    public static String sep_next_ling = "\n";
}
//种子文件放在哪里?
//放在resources里面的话会被打进jar包,不解压jar包就无法直接查看或修改,所以这里分内置(classpath)和外置(文件系统)两种读取方式
seeds.txt
#这是我的种子文件,请按格式添加
中国青年网-国内新闻 http://news.youth.cn/gn/
// First draft: opens the seed file "seeds.txt" as a classpath resource.
public class ReadConfigUtil{
psvm(){
//1. Path of the configuration file
String configPath = "seeds.txt";
//2. Read mode of the configuration file: classpath or file-system path
// NOTE(review): isClassPath is assigned but not consulted in the lines shown here.
boolean isClassPath = true;
//3. Perform the actual file read
// The resource is looked up through this class's class loader.
InputStream is = ReadConfigUtil.class.getClassLoader().getResourceAsStream(configPath);
// Placeholder from the notes ("read the stream"); the reading code is not written out.
流读取
}
}
public class ReadConfigUtil{
public static List<String> readFileToList(String filePath,boolean isClassPath,String charset){
InputStream is = null;
if(isClassPath){
//3.正式进行文件读取
is = ReadConfigUtil.class.getClassLoader().getResourceAsStream(filePath);
}else{
is = new FileInputStream(filePath);
}
流读取
return lineList;
}
psvm() throws IOException{
//1.数据配置文件路径
String configPath = "seeds.txt";
//2.数据配置文件的读取模式,是classpath还是系统路径
boolean isClassPath = true;
//3.正式进行文件读取
List<String> lineList = readFileToList(configPath,isClassPath,StaticValue.encoding);
流读取
}
}
放在resources里面的话会被打进jar包里,一般不好修改;数据文件大部分是外置的,所以放在项目最外层(根目录)即可,即非classPath
// Pick the input source according to isClassPath.
if(isClassPath){
//3. Perform the actual file read
// Classpath mode: resolve filePath through the class loader.
is = ReadConfigUtil.class.getClassLoader().getResourceAsStream(filePath);
}else{
// File-system mode: open filePath directly.
is = new FileInputStream(filePath);
}
/**
 * A crawl task: a title together with the URL to fetch.
 */
public class UrlTaskPojo {
    private String title;
    private String url;

    /** Creates an empty task; both fields start as {@code null}. */
    public UrlTaskPojo() {
    }

    /**
     * Creates a task.
     *
     * @param title title of the task
     * @param url   URL of the task
     */
    public UrlTaskPojo(String title, String url) {
        this.title = title;
        this.url = url;
    }

    /** @return the task title */
    public String getTitle() {
        return title;
    }

    /** @param title the task title */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @return the task URL */
    public String getUrl() {
        return url;
    }

    /** @param url the task URL */
    public void setUrl(String url) {
        this.url = url;
    }
}
/**
 * Holds the pending and finished crawl tasks.
 *
 * <p>Several download threads call {@link #take()} at the same time, and {@code LinkedList}
 * is not thread-safe, so every method here is {@code synchronized} on the class monitor.
 * The two lists stay public for compatibility; code that touches them directly bypasses
 * this lock.
 */
public class TaskScheduleManager {
    /**
     * Pending tasks. Original note: higher-priority tasks go to the head, lower-priority ones
     * to the tail. The methods in this class only append to the tail.
     */
    public static LinkedList<UrlTaskPojo> todoTaskPojoList = new LinkedList<UrlTaskPojo>();

    /** Tasks that have been recorded as done. */
    public static LinkedList<UrlTaskPojo> doneTaskPojoList = new LinkedList<UrlTaskPojo>();

    /** Appends all given tasks to the pending list. */
    public static synchronized void addUrlTaskPojoList(List<UrlTaskPojo> todoTaskList) {
        todoTaskPojoList.addAll(todoTaskList);
    }

    /** Appends one task to the pending list. */
    public static synchronized void addOneUrlTaskPojoList(UrlTaskPojo todoTask) {
        todoTaskPojoList.add(todoTask);
    }

    /** Appends one task to the done list. */
    public static synchronized void addDoneUrlTaskPojoList(UrlTaskPojo todoTask) {
        doneTaskPojoList.add(todoTask);
    }

    /** Removes all given tasks from the pending list. */
    public static synchronized void removeUrlTaskPojoList(List<UrlTaskPojo> removeTaskList) {
        todoTaskPojoList.removeAll(removeTaskList);
    }

    /** Removes the first occurrence of the given task from the pending list. */
    public static synchronized void removeOneUrlTaskPojoList(UrlTaskPojo removeTaskList) {
        todoTaskPojoList.remove(removeTaskList);
    }

    /**
     * Removes and returns the task at the head of the pending list.
     *
     * @return the next task, or {@code null} when the pending list is empty
     */
    public static synchronized UrlTaskPojo take() {
        return todoTaskPojoList.pollFirst();
    }
}
URL_Connection 下载html:
// Draft: opens a URL connection and obtains its input stream for reading the page.
public class WebPageDownloadUtil{
psvm(){
// Target address (left empty in the notes).
url = ""
URL urlObj = new URL(url);
// Placeholder: the charset to decode with is not specified here.
charset
URLConnection urlConnection = urlObj.openConnection();
// Placeholder: the notes mention HttpURLConnection without showing how it is used.
HttpURLConnection xxx
InputStream is = urlConnection.getInputStream();
// Placeholder from the notes ("read the stream"); the reading code is not written out.
读取流
}
}
// Draft: determines a page's charset, first from the response headers, then from the body.
public class WebCharsetDetectorUtil{
psvm(){
// Look for the charset in the response headers
urlConnection.getHeader.... map
// When header and meta disagree, the header wins.
// If it is not found in the header, it is usually found in the meta tag.
String findCharset = null;
// Placeholder: scan the header key/value pairs.
header. key value
// If the header did not yield a charset, fall back to searching the meta tag
if(findCharset == null){
// Placeholder: the input stream (and the reader br built on it) is not written out.
is
while((line=br.readLine())!=null){
// Lower-cased so the match is case-insensitive — presumably a charset= search follows; not shown.
line = line.toLowerCase();
}
}
}
}
// Draft regex helper: returns the requested capture group of the first match.
public class RegexUtil{
// input: text to search; regex: pattern source; groupIndex: capture group to return.
// NOTE(review): this draft signature declares no return type.
getMatchText(String input ,String regex,int groupIndex){
//String line = "xxx";
//String regex = "charset=[\"]*([\\s\\S]*?)[\">]";
// Placeholders for creating the Pattern and Matcher; the notes do not spell them out.
Patten
Matcher
if(matcher.find()){
return matcher.group(groupIndex); // the group index follows the parentheses in the regex
}
// NOTE(review): findCharset is not declared in this method and both branches are "xx" placeholders.
return findCharset==null?xx:xx;
}
}
//24集
多线程
com.tianliangedu.job001.ui
UIManager();
com.tianliangedu.job001.schedule
TaskScheduleManager(); //负责任务调度
DownLoadManager();
com.tianliangedu.job001.download
com.tianliangedu.job001.iface.parser
NewsItemParserInterface();
com.tianliangedu.job001.iface.download
DownloadInterface();
DownLoadRunnable(); 下载线程
com.tianliangedu.job001.parser
HtmlParserManager();
com.tianliangedu.job001.persistence
com.tianliangedu.job001.utils
StaticValue(); //存放项目当中的静态变量
ReadConfigUtil(); //读取配置文件的工具类,既支持直接读取classpath下的,也支持读取外置配置文件
IOUtil();
WebPageDownloadUtil(); //用于下载给定任意网址后对应的html代码
WebCharsetDetectorUtil(); //拿charset
RegexUtil();
ReadConfigUtil(); //读配置文件的工具类
SystemConfigParas(); //系统配置参数工具类
com.tianliangedu.job001.pojos 放不持久化类
UrlTaskPojo();
com.tianliangedu.job001.pojos.entity 与数据库一一对应起来
com.tianliangedu.job001.controller
resources
spider.properties
1.继承Thread
优点:简单,直接,可起动
缺点:java是单继承,影响本类的后续继承扩展性
2.实现Runnable接口
优点:实现接口,对后续继承扩展性无影响
缺点:不如Thread直接
3.线程的状态
新建->就绪->运行->阻塞(等待/挂起)->死亡
// Download worker: keeps taking tasks from the scheduler while its run flag is set.
public class DownLoadRunnable implements Runnable{
// Flag that allows this worker to keep running
// NOTE(review): not volatile, yet DownLoadManager.stopAllThreads writes it from another thread — confirm visibility.
private boolean enableRunningFlag = true; // if static it would be class-wide: stop one stops all, run one runs all
// Placeholder: accessors for enableRunningFlag (setEnableRunningFlag is called by the manager).
getter and setter
// Entry point executed by the thread
@Override
public void run(){
while(enableRunningFlag){
// NOTE(review): spelled TaskscheduleManager here; the class is declared as TaskScheduleManager.
UrlTaskPojo taskPojo = TaskscheduleManager.take();
if(taskPojo!=null){
// NOTE(review): html is never assigned in the lines shown; the download call is missing.
String html;
if(html != null){
}else{
}
}else{
// No task available: report and pause before polling again.
sout("none receive seed");
// Thread.sleep takes milliseconds; InterruptedException is not handled in the lines shown.
Thread.sleep(2);
}
sout();
}
}
}
ThreadGroup 使用示例:
// Starts `count` worker threads inside one named thread group, then polls the group.
ThreadGroup threadGroup = new ThreadGroup("spider_group");
int count = 5;
for(int i = 1 ; i <= count ; i ++){
// NOTE(review): spelled DownloadRunnable here but DownLoadRunnable where the class is declared — confirm.
DownloadRunnable oneRunnable = new DownloadRunnable("");
new Thread(threadGroup,oneRunnable,"thread_" + i).start();
}
// Array sized from the group's active-thread count at this moment.
Thread[] threadArray = new Thread[threadGroup.activeCount()];
for(int i=0;i<3;i++){// check 3 times
Thread.sleep(3); // Thread.sleep takes milliseconds
sout(threadGroup.activeCount());
// Copies the group's active threads into threadArray.
threadGroup.enumerate(threadArray);
for(Thread t:threadArray){
sout();
}
}
集合方法管理线程组:
标志位在run里面结束
DownloadManager{
// Stops the workers cooperatively by clearing each one's run flag.
for(Runnable oneRun:runnableList){
// Fix: the variable was declared as "tmepObj" but used as "tmpObj".
DownloadRunnable tmpObj = (DownloadRunnable)oneRun;
tmpObj.setEnableRunningFlag(false);// ask this worker to finish
sout();
}
}
public class DownLoadManager{
//线程组初始化
public static ThreadGroup_tGroup = new ThreadGroup("下载线程组");
//线程组之Runnable管理的集合对象
public static List<DownLoadRunnable> runnableList = new ArrayList<>();
//开启多少个下载线程
public static void start(){
int consumerNumber = 3;
List<Runnable> runnableList = new ArrayList<Runnable>();
for(int i = 1 ; i <= consumerNumber ; i++){
DownLoadRunnable oneRunnable = new DownLoadRunnable("");
new Thread(tGroup,oneRunnable,"");
runnableList.add(oneRunnable);
}
}
//获取线程的状态信息-多少个活着的下载线程
public static int getActiveDownLoadThreads(){
return tGroup.activeCount();
}
//一共初始化了多少个线程
public static int getInitDownLoadThreads(){
return initConsumerNumber;
}
//停止掉所有线程
public static void stopAllThreads(){
for(DownLoadRunnable runnable:runnableList){
runnable.setEnableRunningFlag(false);
}
}
}
配置文件工具类:
spider.properties utf-8
# 爬虫的配置文件
#针对download设置的配置参数
init_consumer_number=3
#每次遇到空任务时候的睡眠时间,单位为毫秒
sleep_time_for_empty=2000
//调用
ReadConfigUtil
/**
 * Loads a {@code .properties} file and exposes its values by key.
 *
 * <p>Lookup order: the file-system path first, then the classpath. The file is decoded as UTF-8.
 */
public class ReadConfigUtil {
    /** The loaded key/value pairs. */
    private final Properties configObj = new Properties();

    /**
     * Loads the configuration.
     *
     * @param configFilePath file-system path or classpath resource name
     * @throws UncheckedIOException if the file is found in neither place or cannot be read
     */
    public ReadConfigUtil(String configFilePath) {
        // try-with-resources closes the stream on every path; the draft's "finally" had no "try".
        try (InputStream is = openConfig(configFilePath);
             Reader reader = new InputStreamReader(is, StandardCharsets.UTF_8)) {
            // Load through the reader; the draft loaded the raw stream, which is decoded as
            // ISO-8859-1 and garbles UTF-8 content.
            configObj.load(reader);
        } catch (IOException e) {
            // Unchecked so static initialisers can construct this class without a throws clause.
            throw new UncheckedIOException("Failed to load configuration: " + configFilePath, e);
        }
    }

    /** Opens the configuration from the file system if present, otherwise from the classpath. */
    private static InputStream openConfig(String configFilePath) throws IOException {
        File configFile = new File(configFilePath);
        if (configFile.exists()) {
            return new FileInputStream(configFile);
        }
        InputStream is = ReadConfigUtil.class.getClassLoader().getResourceAsStream(configFilePath);
        if (is == null) {
            throw new FileNotFoundException(
                    "Configuration not found on file system or classpath: " + configFilePath);
        }
        return is;
    }

    /**
     * Looks up a value.
     *
     * @param key property key
     * @return the value, or {@code null} if the key is absent
     */
    public String getValue(String key) {
        return configObj.getProperty(key);
    }

    /** Demo entry point: loads {@code spider.properties} and prints one of its values. */
    public static void main(String[] args) {
        String configFilePath = "spider.properties";
        ReadConfigUtil configUtil = new ReadConfigUtil(configFilePath);
        System.out.println(configUtil.getValue("init_consumer_number"));
    }
}
public class SystemConfigParas{
//初始化参数读取的工具类实例
public static ReadConfigUtil configUtil = null;
static{
configUtil = new ReadConfigUtil("spider.properties");
}
//集中读取download相关的参数
public static int init_consumer_number = ....getValue();
sleep_time_for_empty
}
// Draft: narrows the page source to one block, then walks the matches inside it.
public class HtmlParserManager{
psvm(){
String url = "";
String htmlSource = "";
//1. First narrow down to a small block of the page
String blockRegex = "<div class=\"main_1\">([\\s\\S]*)<div class=\"main_r\"";
// NOTE(review): group 0 is the whole match, not the captured block, and the result is discarded here.
getMatchText(htmlSource,blockRegex,0);
//2. Then take the matching items one by one
// NOTE(review): matcher is not declared in the lines shown.
while(matcher.find()){
// First extract the exact title
String titleRegex = "";
String title = xxxx;
...
}
sout();
}
}
/**
 * Regular-expression helpers.
 */
public class RegexUtil {

    /**
     * Returns one capture group of the first match of {@code regex} in {@code input}.
     *
     * @param input      text to search; {@code null} yields {@code null}
     * @param regex      pattern source; {@code null} yields {@code null}
     * @param groupIndex capture group to return (0 is the whole match)
     * @return the group text, or {@code null} when there is no match
     * @throws IndexOutOfBoundsException if the pattern has no group with that index
     */
    public static String getMatchText(String input, String regex, int groupIndex) {
        // A missing page source is treated the same as "no match".
        if (input == null || regex == null) {
            return null;
        }
        Matcher matcher = Pattern.compile(regex).matcher(input);
        if (matcher.find()) {
            return matcher.group(groupIndex);
        }
        return null;
    }
}
title
postTimeString. //发布时间
sourceURL
insertDate //插入时间
postDateObj //发布时间另一种形式
// Interface for news-item parsers; no methods are declared in these notes.
public interface NewsItemParserInterface{
}
// Draft: reads a value from the child of `element` at `childIndex`; the kind of value depends on the content type.
// NOTE(review): the parameter is named contype but the switch reads contentType — one of the two is a typo.
String getChildElementValue(Element element,int childIndex,ContentSelectType contype){
String value = null;
switch(contentType){
// NOTE(review): OUTHER_HTML looks like a misspelling of OUTER_HTML — confirm against the enum declaration.
case OUTHER_HTML:
value = element.child(childIndex).outerHtml();
break;
// Placeholder: the remaining cases are not written out.
case...
}
// NOTE(review): no return statement appears in the lines shown.
}
com.tianliangedu.job001.iface.persistence
/**
 * Data persistence interface: saves news items.
 *
 * <p>Declared as an interface because the draft's body-less methods are illegal in a concrete
 * class and the implementation is written with {@code implements}. The draft's private
 * {@code DataBaseUtil} field and empty constructor are dropped, since an interface cannot hold them.
 */
public interface DataPersistenceInterface {

    /**
     * Saves a batch of items.
     *
     * @param itemEntity the items to save
     * @return presumably {@code true} on success — the draft does not define it; confirm with the implementation
     */
    boolean persist(List<NewsItemEntity> itemEntity);

    /**
     * Saves a single item.
     *
     * @param itemEntity the item to save
     * @return presumably {@code true} on success — the draft does not define it; confirm with the implementation
     */
    boolean persist(NewsItemEntity itemEntity);
}
// Draft MySQL implementation of the persistence interface (interface name left as a placeholder).
DataPersist4MySqlImpl implements xxxInterface{
@Override
public boolean persist(NewsItemEntity itemEntity){
// A new DataBaseUtil is created on every call; driver, url, username and password are not declared in the lines shown.
DataBaseUtil dbutil = new DataBaseUtil(driver,url,username,password);
// Placeholder: building the statement `ps` is not written out.
xxxx
ps.executeUpdate();
// NOTE(review): no boolean is returned in the lines shown, and the list overload is not implemented.
}
}
//乱码解决:
show variables like 'character_set%'
mysql配置:
my-default.ini
default-character-set=utf8
init_connect='set names utf8'
my.ini
character-set-server=utf8
jdbc参数:
mybatis.properties
//TODO
线程问题
//TODO
// Monitoring and log management system (stub; nothing is implemented yet)
MonitorManager{
}