满天星
Fork me on GitHub

天亮爬虫学习笔记02

实时新闻采集器

天亮爬虫篇--中级篇

今日头条早些年完全是爬虫,现在是厂家来爬它

maven

WebSpiderMiddle4Job001();
<!-- 仓库源配置 -->
nexus aliyun
<!-- 依赖包 -->
<dependencies>
<!-- 项目打包与发布 -->

com.tianliangedu.job001.ui
    UIManager();
com.tianliangedu.job001.schedule
    TaskScheduleManager();  //负责任务调度
com.tianliangedu.job001.download
com.tianliangedu.job001.parser
com.tianliangedu.job001.persistence
com.tianliangedu.job001.utils
    StaticValue(); //存放项目当中的静态变量
    ReadConfigUtil();    //读取配置文件的工具类,既支持直接读取classpath下的,也支持读取外置配置文件
    IOUtil();
    WebPageDownloadUtil();  //用于下载给定任意网址后对应的html代码
    WebCharsetDetectorUtil();  //拿charset
    RegexUtil();
com.tianliangedu.job001.pojos  放不持久化类
    UrlTaskPojo();
com.tianliangedu.job001.pojos.entity    与数据库一一对应起来
com.tianliangedu.job001.controller

seeds.txt   不放在resources里面
resources


//git化管理
192.168.1.14
在gitlab,->your projects挑一个点进去-> groups ->挑一个->new project
->project name:WebSpiderMiddle4Job001
Project description(optional):
将源码xxx与gitlab项目进行正规的版本化操作

复制http
//git init
eclipse->右键项目->Team->Share Project->Git->
选中Use or create ...    取消里面的项目选中状态
点击Create Repository,来创建本地版本库->Finish

右键项目->Team->Commit...
勾选pom.xml
上方message写上:项目环境搭建与包初始化,项目的第一次提交
commit

右键项目->Team->Remote->Push

Source ref:  xxx/master.   (在本地默认是master)
Destination ref: xxx/dev_v1(存放的位置)   Add. ->next->finish

自定义编辑author 注释方法
种子文件读取工具类:

public class UIManager{
    public UrlTaskPojo getRootUrlByDirect(){
        return UrlTaskPojo(StaticValue.rootTitle,StataicValue.rootUrl);
    }
    public UrlTaskPojo getRootUrlByStaticValue(){
        return new UrlTaskPojo(StaticValue.rootTitle,StataicValue.rootUrl);
    }
    public List<UrlTaskPojo> getRootUrlBySeedFile(String dataFilePath,boolean isClassPath){
        //这里改为封装类,里面至少包含title,url
        List<String> lineList = IOUtil.readFileToList(dataFilePath,isClassPath,StaticValue.defaultEncoding);

        List<UrlTaskPojo> resultTaskPojo = xx;
        for(String line:lineList){
            line = line.trim();
            if(line.length() > 0 && !line.startsWith("#")){
                String[] columnArray = line.split("\\s");
                if(columnArray.length == 2){
                    UrlTaskPojo tempPojo = new UrlTaskPojo(columnArray[0].trim(),columnArray[1].trim());
                    resultTaskPojo.add(tempPojo);
                }else{
                    sout.err();
                    throw new Exception("");
                }
            }
        }

        return resultTaskPojo;
    }

    public static void addSeedurlsToTaskSchedule(){
        xxx
    }
}

/**
 * Holds the project's static settings.
 */
public class  StaticValue{
    /** URL of the root page. */
    public static String rootUrl = "http://xxx";
    /** Default encoding used when reading files. */
    public static String defaultEncoding = "utf-8";
    /** Title paired with {@link #rootUrl}. */
    public static String rootTitle = "";
    /** Separator setting: the newline character. */
    public static String sep_next_ling = "\n";
}

//种子文件放在哪里?
//放在resources里面的话会被打进jar包,不解包就没法直接修改,所以这里分内置(classpath)和外置(文件系统)两种

seeds.txt
#这是我的种子文件,请按格式添加
中国青年网-国内新闻    http://news.youth.cn/gn/

/**
 * First sketch of the seed-file reader: opens {@code seeds.txt} as a
 * classpath resource. Notes-style pseudo-code ({@code psvm} presumably
 * abbreviates the main method); the actual stream reading is a placeholder.
 */
public class ReadConfigUtil{
    psvm(){
        // 1. Path of the config file.
        String configPath = "seeds.txt";
        // 2. Read mode of the config file: classpath or file-system path.
        // NOTE(review): this flag is not consulted below; the classpath loader is always used.
        boolean isClassPath = true;
        // 3. Perform the actual file read.
        InputStream is = ReadConfigUtil.class.getClassLoader().getResourceAsStream(configPath);
        // Placeholder in the notes: read the stream here.
        流读取

    }

}

public class ReadConfigUtil{
    public static List<String> readFileToList(String filePath,boolean isClassPath,String charset){
        InputStream is = null;
        if(isClassPath){
            //3.正式进行文件读取
            is = ReadConfigUtil.class.getClassLoader().getResourceAsStream(filePath);
        }else{
            is = new FileInputStream(filePath);
        }
        流读取
        return lineList;
    }
    psvm() throws IOException{
        //1.数据配置文件路径
        String configPath = "seeds.txt";
        //2.数据配置文件的读取模式,是classpath还是系统路径
        boolean isClassPath = true;
        //3.正式进行文件读取
        List<String> lineList = readFileToList(configPath,isClassPath,StaticValue.encoding);
        流读取

    }

}

放在resources里面的话会被打包进jar里,一般不好修改;数据大部分是外置的,所以放在项目最外面即可,即非classPath
// Choose the input source according to the isClassPath flag.
if(isClassPath){
    // 3. Perform the actual file read: load the file through the class loader.
    is = ReadConfigUtil.class.getClassLoader().getResourceAsStream(filePath);
}else{
    // Otherwise open filePath as a file-system path.
    is = new FileInputStream(filePath);
}

/**
 * One crawl task: a title together with the URL it belongs to.
 */
public class UrlTaskPojo{
    private String title;
    private String url;

    /** Creates an empty task; fill it through the setters. */
    public UrlTaskPojo(){

    }

    /**
     * Creates a task.
     *
     * @param title title of the task
     * @param url   URL of the task
     */
    public UrlTaskPojo(String title,String url){
        this.title = title;
        this.url = url;
    }

    /** @return the title of this task */
    public String getTitle(){
        return title;
    }

    /** @param title the new title */
    public void setTitle(String title){
        this.title = title;
    }

    /** @return the URL of this task */
    public String getUrl(){
        return url;
    }

    /** @param url the new URL */
    public void setUrl(String url){
        this.url = url;
    }
}

/**
 * Keeps the queue of pending crawl tasks and the list of finished ones.
 *
 * <p>Several download threads call {@link #take()} at the same time and
 * {@link LinkedList} is not thread-safe, so every method is
 * {@code synchronized} on this class. Code that touches the public list
 * fields directly bypasses that lock.
 */
public class TaskScheduleManager{
    /** Pending tasks: high-priority ones are added at the head, low-priority ones at the tail. */
    public static LinkedList<UrlTaskPojo> todoTaskPojoList = new LinkedList<UrlTaskPojo>();
    /** Tasks that have been processed. */
    public static LinkedList<UrlTaskPojo> doneTaskPojoList = new LinkedList<UrlTaskPojo>();

    /**
     * Appends several tasks to the tail of the pending queue.
     *
     * @param todoTaskList tasks to enqueue
     */
    public static synchronized void addUrlTaskPojoList(List<UrlTaskPojo> todoTaskList){
        todoTaskPojoList.addAll(todoTaskList);
    }

    /**
     * Appends one task to the tail of the pending queue.
     *
     * @param todoTask task to enqueue
     */
    public static synchronized void addOneUrlTaskPojoList(UrlTaskPojo todoTask){
        todoTaskPojoList.add(todoTask);
    }

    /**
     * Records a task as done.
     *
     * @param todoTask the finished task
     */
    public static synchronized void addDoneUrlTaskPojoList(UrlTaskPojo todoTask){
        doneTaskPojoList.add(todoTask);
    }

    /**
     * Removes the given tasks from the pending queue.
     *
     * @param removeTaskList tasks to remove
     */
    public static synchronized void removeUrlTaskPojoList(List<UrlTaskPojo> removeTaskList){
        todoTaskPojoList.removeAll(removeTaskList);
    }

    /**
     * Removes one task from the pending queue.
     *
     * @param removeTaskList the task to remove
     */
    public static synchronized void removeOneUrlTaskPojoList(UrlTaskPojo removeTaskList){
        todoTaskPojoList.remove(removeTaskList);
    }

    /**
     * Takes the task at the head of the pending queue.
     *
     * @return the head task, or {@code null} when the queue is empty
     */
    public static synchronized UrlTaskPojo take(){
        return todoTaskPojoList.pollFirst();
    }

}


URL_Connection 下载html:

/**
 * Sketch of fetching a page through {@code URL.openConnection()} and reading
 * its input stream. Notes-style pseudo-code: several lines are placeholders,
 * not Java statements.
 */
public class WebPageDownloadUtil{
    psvm(){
        // Placeholder: the address of the page to download.
        url = ""
        URL urlObj = new URL(url);
        // Placeholder: the charset of the page.
        charset
        URLConnection urlConnection = urlObj.openConnection();
        // Placeholder mentioning HttpURLConnection; presumably a cast of urlConnection - confirm.
        HttpURLConnection xxx
        InputStream is = urlConnection.getInputStream();
        // Placeholder: read the stream here.
        读取流


    }
}


/**
 * Sketch of charset detection: look in the response headers first and fall
 * back to the page's meta information. Notes-style pseudo-code with
 * placeholder lines.
 */
public class WebCharsetDetectorUtil{
    psvm(){
        // Look for the charset in the response headers.
        urlConnection.getHeader....  map

        // When the header and the meta tag disagree, the header wins.
        // When the header has no charset, it is usually found in the meta tag.

        String findCharset = null;
        // Placeholder: inspect the header key/value pairs.
        header.  key value

        // Nothing found in the headers: fall back to searching the meta tag.
        if(findCharset == null){
            // Placeholder: the page's input stream.
            is
            while((line=br.readLine())!=null){
                // Each line is lower-cased; the match that would set findCharset is not shown.
                line = line.toLowerCase();
            }
        }
    }
}

/**
 * First sketch of the regex helper (notes-style pseudo-code: the Pattern and
 * Matcher lines are placeholders).
 */
public class RegexUtil{
    // Returns the requested capture group of the first match of regex in input.
    // NOTE(review): the return type is missing from this signature.
    getMatchText(String input ,String regex,int groupIndex){
        // Example input line:
        //String line = "xxx";
        // Example pattern for pulling out a charset value:
        //String regex = "charset=[\"]*([\\s\\S]*?)[\">]";
        Patten
        Matcher
        if(matcher.find()){
            return matcher.group(groupIndex);   // the group index is counted by the parentheses in the regex
        }
        // NOTE(review): findCharset is not declared in this method; the branch values are placeholders.
        return findCharset==null?xx:xx;
    }
}


//24集
多线程
com.tianliangedu.job001.ui
    UIManager();
com.tianliangedu.job001.schedule
    TaskScheduleManager();  //负责任务调度
    DownLoadManager();
com.tianliangedu.job001.download
com.tianliangedu.job001.iface.parser
    NewsItemParserInterface();
com.tianliangedu.job001.iface.download
    DownloadInterface();
    DownLoadRunnable(); 下载线程
com.tianliangedu.job001.parser
    HtmlParserManager();
com.tianliangedu.job001.persistence
com.tianliangedu.job001.utils
    StaticValue(); //存放项目当中的静态变量
    ReadConfigUtil();    //读取配置文件的工具类,既支持直接读取classpath下的,也支持读取外置配置文件
    IOUtil();
    WebPageDownloadUtil();  //用于下载给定任意网址后对应的html代码
    WebCharsetDetectorUtil();  //拿charset
    RegexUtil();
    ReadConfigUtil();     //读配置文件的工具类
    SystemConfigParas();    //系统配置参数工具类
com.tianliangedu.job001.pojos  放不持久化类
    UrlTaskPojo();
com.tianliangedu.job001.pojos.entity    与数据库一一对应起来
com.tianliangedu.job001.controller
resources
    spider.properties



1.继承Thread
    优点:简单,直接,可起动
    缺点:java是单继承,影响本类的后续继承扩展性
2.实现Runnable接口
    优点:实现接口,对后续继承扩展性无影响
    缺点:不如Thread直接
3.线程的状态
    新建->就绪->运行->阻塞(等待/挂起)->死亡

public class DownLoadRunnable implements Runnable{
    //线程可以运行的标志变量
    private boolean enableRunningFlag = true; //static对类而言一停全停一跑全跑

    getter and setter

    //线程运行的入口方法
    @Override
    public void run(){
        while(enableRunningFlag){
            UrlTaskPojo taskPojo = TaskscheduleManager.take();
            if(taskPojo!=null){
                String html;
                if(html != null){

                }else{

                }
            }else{
                sout("none receive seed");
                Thread.sleep(2);
            }
            sout();
        }
    }
}


ThreadGroup 使用示例:
// Create a named thread group for the worker threads.
ThreadGroup threadGroup  = new ThreadGroup("spider_group");

// Start `count` threads in that group, named thread_1 .. thread_count.
int count = 5;
for(int i = 1 ; i <= count ; i ++){
    DownloadRunnable oneRunnable = new DownloadRunnable("");
    new Thread(threadGroup,oneRunnable,"thread_" + i).start();

}

// The array is sized from activeCount() at this moment.
Thread[] threadArray = new Thread[threadGroup.activeCount()];
for(int i=0;i<3;i++){// check 3 times
    Thread.sleep(3);
    sout(threadGroup.activeCount());
    // Copy the group's active threads into the array, then visit each entry.
    threadGroup.enumerate(threadArray);
    for(Thread t:threadArray){
        sout();
    }
}


集合方法管理线程组:

标志位在run里面结束

DownloadManager{
    // Stop the workers by clearing each one's running flag; every run() loop
    // checks the flag and ends once it is false.
    for(Runnable oneRun:runnableList){
        DownLoadRunnable tmpObj = (DownLoadRunnable)oneRun;
        tmpObj.setEnableRunningFlag(false);// ask this worker to finish
        sout();
    }
}


public class DownLoadManager{
    //线程组初始化
    public static ThreadGroup_tGroup = new ThreadGroup("下载线程组");
    //线程组之Runnable管理的集合对象
    public static List<DownLoadRunnable> runnableList = new ArrayList<>();
    //开启多少个下载线程
    public static void start(){
        int consumerNumber = 3;
        List<Runnable> runnableList = new ArrayList<Runnable>();
        for(int i = 1 ; i <= consumerNumber ; i++){
            DownLoadRunnable oneRunnable = new DownLoadRunnable("");
            new Thread(tGroup,oneRunnable,"");
            runnableList.add(oneRunnable);
        }
    }
    //获取线程的状态信息-多少个活着的下载线程
    public static int getActiveDownLoadThreads(){
        return tGroup.activeCount();
    }

    //一共初始化了多少个线程
    public static int getInitDownLoadThreads(){
        return initConsumerNumber;
    }
    //停止掉所有线程
    public static void stopAllThreads(){
        for(DownLoadRunnable runnable:runnableList){
            runnable.setEnableRunningFlag(false);
        }
    }

}


配置文件工具类:

spider.properties utf-8
# 爬虫的配置文件

#针对download设置的配置参数
init_consumer_number=3
#每次遇到空任务时候的睡眠时间,单位为毫秒
sleep_time_for_empty=2000

//调用
ReadConfigUtil

/**
 * Loads a {@code .properties} file and serves its values by key.
 *
 * <p>Lookup order: the file-system path first, then the classpath. The file
 * is decoded as UTF-8.
 */
public class ReadConfigUtil{
    /** The JDK's own properties holder, filled once by the constructor. */
    private final Properties configObj = new Properties();

    /**
     * Loads the given config file.
     *
     * @param configFilePath file-system path, or classpath resource name used
     *                       when no such file exists
     * @throws IllegalArgumentException if the file is found in neither place
     * @throws IllegalStateException    if the file exists but cannot be read
     */
    public ReadConfigUtil(String configFilePath){
        // Lookup order: 1. file-system path, 2. classpath.
        File configFile = new File(configFilePath);
        try{
            InputStream is = configFile.isFile()
                    ? new FileInputStream(configFile)
                    : ReadConfigUtil.class.getClassLoader().getResourceAsStream(configFilePath);
            // getResourceAsStream reports a missing resource as null.
            if(is == null){
                throw new IllegalArgumentException(
                        "config file not found on file system or classpath: " + configFilePath);
            }
            // Closing the reader also closes the underlying stream.
            try(Reader reader = new InputStreamReader(is,StandardCharsets.UTF_8)){
                configObj.load(reader);
            }
        }catch(IOException e){
            throw new IllegalStateException("failed to read config file: " + configFilePath,e);
        }
    }

    /**
     * Looks up a config value.
     *
     * @param key property name
     * @return the value, or {@code null} when the key is absent
     */
    public String getValue(String key){
        return configObj.getProperty(key);
    }

    /**
     * Demo: loads {@code spider.properties} and prints one of its values.
     *
     * @param args unused
     */
    public static void main(String[] args){
        String configPath = "spider.properties";
        ReadConfigUtil configUtil = new ReadConfigUtil(configPath);
        System.out.println(configUtil.getValue("init_consumer_number"));
    }

}

public class SystemConfigParas{
    //初始化参数读取的工具类实例
    public static ReadConfigUtil configUtil = null;
    static{
        configUtil = new ReadConfigUtil("spider.properties");
    }

    //集中读取download相关的参数
    public static int init_consumer_number = ....getValue();
    sleep_time_for_empty

}

/**
 * Sketch of the two-step list-page parse: first cut out the block that holds
 * the news list, then walk the items inside it. Notes-style pseudo-code with
 * placeholder lines.
 */
public class HtmlParserManager{
    psvm(){
        String url = "";
        String htmlSource = "";

        // 1. First narrow the page down to a small data block.
        String blockRegex = "<div class=\"main_1\">([\\s\\S]*)<div class=\"main_r\"";

        // NOTE(review): group 0 is the whole match, including both div tags, and the
        // result is not assigned; group 1 would be the captured content - confirm.
        getMatchText(htmlSource,blockRegex,0);
        // 2. Then take the matching blocks one by one.
        // NOTE(review): matcher is not declared in this sketch.
        while(matcher.find()){
            // Get the exact title first.
            String titleRegex = "";
            String title = xxxx;
            ...
        }

        sout();
    }

}

/**
 * Small regular-expression helper.
 */
public class RegexUtil{
    /**
     * Returns one group of the first match of {@code regex} in {@code input}.
     *
     * @param input      text to search; {@code null} yields {@code null}
     * @param regex      regular expression; {@code null} yields {@code null}
     * @param groupIndex group to return: 0 is the whole match, 1..n are the
     *                   capturing groups counted by their opening parentheses
     * @return the text of the requested group, or {@code null} when there is no match
     * @throws IndexOutOfBoundsException if the pattern has no group {@code groupIndex}
     */
    public static String getMatchText(String input ,String regex,int groupIndex){
        // For example, a page that failed to download arrives here as null.
        if(input == null || regex == null){
            return null;
        }
        Matcher matcher = Pattern.compile(regex).matcher(input);
        if(matcher.find()){
            return matcher.group(groupIndex);
        }
        return null;
    }
}


title
postTimeString. //发布时间
sourceURL 
insertDate    //插入时间
postDateObj    //发布时间另一种形式

/**
 * Interface for news-item parsers. No methods are declared in these notes.
 */
public interface NewsItemParserInterface{

}

String getChildElementValue(Element element,int childIndex,ContentSelectType contype){
    String value = null;
    switch(contentType){
    case OUTHER_HTML:
        value = element.child(childIndex).outerHtml();
        break;
    case...
    }
}


com.tianliangedu.job001.iface.persistence
/**
 * Data persistence interface: the contract implemented by the concrete
 * storage back ends.
 *
 * <p>Declared as an interface because its methods have no bodies and it is
 * used with {@code implements}; an implementation creates and owns its own
 * database helper.
 */
public interface DataPersistenceInterface{
    /**
     * Saves a batch of news items.
     *
     * @param itemEntity items to save
     * @return whether the save succeeded
     */
    public boolean persist(List<NewsItemEntity> itemEntity);

    /**
     * Saves a single news item.
     *
     * @param itemEntity item to save
     * @return whether the save succeeded
     */
    public boolean persist(NewsItemEntity itemEntity);
}

// Sketch of the MySQL-backed persistence implementation (notes-style
// pseudo-code; the interface name and the statement setup are placeholders).
DataPersist4MySqlImpl implements xxxInterface{
    // Saves a single item through a DataBaseUtil built from the connection settings.
    // NOTE(review): no boolean is returned, and a new DataBaseUtil is built on every call.
    @Override
    public boolean persist(NewsItemEntity itemEntity){
        DataBaseUtil dbutil = new DataBaseUtil(driver,url,username,password);
        // Placeholder: prepare the statement `ps` and bind the item's fields.
        xxxx
        ps.executeUpdate();
    }
}

//乱码解决:

show variables like 'character_set%'

mysql配置:
my-default.ini
default-character-set=utf8
init_connect='set names utf8'

my.ini
character-set-server=utf8

jdbc参数:
mybatis.properties

//TODO
线程问题
//TODO


// Monitoring and log management system.
// Empty stub in these notes: no members are defined yet.
MonitorManager{

}
-------------本文结束期待您的评论-------------