大数据采集学习笔记01-实时热点

APP热点标签分析

需求分析
    给定一批app名称及其描述信息，共52.9万条数据
    每条数据最多包含6个字段，依次为（appId，app名称，一级分类，二级分类，三级分类，Tags描述信息）；但数据并不完全规整，实际每行可能只有4个、5个或6个字段，需要视情况做字段对齐。
    通过大数据开发之hive数据仓库命令行形式，完成数据加载，udf/udaf/udtf函数，统计分析的任务，并演示项目效果即可。
主要思路
    通过hive命令将数据加到数据仓库中
    使用hql+udf/udaf/udtf完成统计分析
    将统计分析结果插入到hive中自建的新表中
主要考点
    hive及hiveSQL常用命令
    系统函数+udf/udaf/udtf灵活使用
    hive常见问题的解决

技术组成：hive sql+udf/udaf/udtf
步骤拆解：
    1）输入，输出表设计到位            （1-0.4h）
    2）将数据加载到输入表中            （1-0.4h）
    3）hivesql+udf/udaf/udtf实现热词统计与写入库表    （1-0.5h）
开发细节
    4.0 prepare
        1）相关目录创建
            config:存放相关配置变量
            create:存放表结构数据
            deal:具体的sql脚本
            udf:udf/udaf/udtf相关的jar包
    4.1 按步骤执行
5.bug修复，调优
6.上线
    6.1项目部署
    6.2上线

-- Input table: raw app metadata loaded from a tab-separated text file.
-- Declared as an external table because the data is brought in from outside Hive;
-- for an external table, dropping the table removes only the metadata.
-- Field/line delimiters and the storage format are stated explicitly.
create external table app_tag_meta_info(
    id string,               -- app id
    name string,             -- app name
    first_classify string,   -- first-level category
    second_classify string,  -- second-level category
    third_classify string,   -- third-level category
    tags string              -- tag description text; split on ',' by the analysis queries
)
-- Partitioned table: each load is written into its own dt partition.
partitioned by (dt string comment 'update date')
row format delimited
    fields terminated by '\t'
    lines terminated by '\n'
stored as textfile;

touch config.sh 或者 touch set_env.sh

#! /bin/bash
# Shared environment settings; the job scripts load this file with `source`.
# HIVE is the path to the Hive CLI executable that those scripts run as $HIVE.
HIVE='/usr/bin/hive'

# 使用 source set_env.sh 加载变量；sh set_env.sh 会在子 shell（新进程）中执行，变量只在该子 shell 内生效，不会保留到当前窗口
//改权限
chown -R hive 文件夹名.   -R为递归赋予hive权限

[...config]#cd ../create/ 
touch app_tag_meta_info.sh
#! /bin/bash
# Creates the partitioned external input table in Hive.
# The relative path below assumes the script is run from the create/ directory.

# Brings $HIVE (path to the Hive CLI) into scope.
source ../config/set_env.sh

target_db="job_002"
target_table="app_tag_meta_info"

# The statement text is assembled first, then handed to the CLI in one call.
create_sql="
    use $target_db;
    create external table $target_table(
        id string,
        name string,
        first_classify string,
        second_classify string,
        third_classify string,
        tags string
    )
    partitioned by (dt string comment 'update date')
    row format delimited fields terminated by '\t' lines terminated by 
    '\n'
    stored as textfile;
"

$HIVE -e "$create_sql"

# sh app_tag_meta_info.sh
use job_002
show tables
desc app_tag_meta_info

如果修改了建表的 .sh 脚本，直接重新执行会因为表已存在而失败；需要先删除之前用该脚本建好的表，再重新执行脚本建表
drop table app_tag_meta_info;
sh ....

-- Output table: one row per tag with its occurrence count, one partition per dt.
create table hot_tag_rank(
    tag string,  -- a single tag value
    freq int     -- number of occurrences of the tag (no trailing comma after the last column)
)
partitioned by (dt string comment 'update date')
row format delimited
    fields terminated by '\t'
    lines terminated by '\n'
stored as textfile;

sh hot_tag_rank.sh


//将数据加载到数据表中
[create]# cd ../deal
rz -y
touch produce_app_tag_meta_info.sh

#! /bin/bash
# Loads the local app-metadata text file into one dt partition of the input table.
# Usage: sh produce_app_tag_meta_info.sh <yyyymmdd>

# Brings $HIVE (path to the Hive CLI) into scope.
source ../config/set_env.sh

# $1, the first positional argument, is the partition date (e.g. 20180507).
updateDT=$1
if [ -z "$updateDT" ]; then
    echo "usage: $0 <yyyymmdd>" >&2
    exit 1
fi

db="job_002"
# No spaces around '=': with spaces bash would try to run `table_name` as a command.
table_name="app_tag_meta_info"
# Presumably reserved for a UDF jar / class; not used by this script.
jar_path=""
class_path=""
# Relative path, resolved against the directory the script is run from.
data_source_path="app_abstract_info.txt"

# `overwrite` replaces the current contents of the target partition, so loading
# the same date twice does not duplicate rows.
$HIVE -e "
    use $db;
    load data local inpath '$data_source_path' overwrite into table $table_name partition(dt='$updateDT');
"

sh produce_app_tag_meta_info.sh 20180507

-- Peek at a few loaded rows.
select * from app_tag_meta_info limit 10;
-- List the table's partitions.
show partitions app_tag_meta_info;

-- step 1: look at the raw tags column.
select tags from app_tag_meta_info limit 10;
-- Refine the previous query: split the comma-separated string into a collection.
select split(tags,',') from app_tag_meta_info limit 10;
-- step 2: turn the collection back into individual strings, one per row.
-- Not adopted: with a bare explode() in the select list, a later where clause
-- cannot refer to the exploded value.
select explode(split(tags,',')) from app_tag_meta_info limit 10;
-- step 3 (adopted): wrap the explode in a lateral view. The exploded value is
-- exposed as column `tag` of `tag_table`, so the where clause can use it.
select tag
from app_tag_meta_info
lateral view explode(split(tags,',')) tag_table as tag
limit 10;
-- step 4: bug fix and clean-up - drop empty tags.
select tag
from app_tag_meta_info
lateral view explode(split(tags,',')) tag_table as tag
where tag != ''
limit 10;
-- Rank by frequency, highest first (hence `desc`). order by is evaluated after
-- the select list, which is why it can see the alias freq.
select tag, count(1) as freq
from app_tag_meta_info
lateral view explode(split(tags,',')) tag_table as tag
where tag != '' and tag != '-'
group by tag
order by freq desc
limit 10;

-- Run the following query on the Tez execution engine.
set hive.execution.engine=tez;

-- Write the result set into the output table. For a partitioned table,
-- `insert overwrite` replaces the contents of the named partition.
insert overwrite table hot_tag_rank partition(dt='20180507')
select tag, count(1) as freq
from app_tag_meta_info
lateral view explode(split(tags,',')) tag_table as tag
where tag != '' and tag != '-' and dt = '20180507'
group by tag
order by freq desc;

//封装成sh
#! /bin/bash
# Counts tag frequencies for one dt partition of app_tag_meta_info and writes the
# ranking into the same dt partition of hot_tag_rank.
# Usage: sh produce_hot_tag_rank.sh <yyyymmdd>

# Brings $HIVE (path to the Hive CLI) into scope.
source ../config/set_env.sh

# $1 is the partition date to process (e.g. 20180507).
updateDT=$1
if [ -z "$updateDT" ]; then
    echo "usage: $0 <yyyymmdd>" >&2
    exit 1
fi

db="job_002"
# No spaces around '=': with spaces bash would try to run `table_name` as a command.
table_name="hot_tag_rank"
# Presumably reserved for a UDF jar / class; not used by this script.
jar_path=""
class_path=""

# Notes are kept outside the quoted SQL: `//` is not Hive comment syntax, so text
# after it would be sent to Hive as part of the statement.
# The `set` line selects Tez as the execution engine.
$HIVE -e "
    use $db;
    set hive.execution.engine=tez;
    insert overwrite table $table_name partition(dt='$updateDT')
    select tag, count(1) as freq
    from app_tag_meta_info
    lateral view explode(split(tags,',')) tag_table as tag
    where tag != '' and tag != '-' and dt = '$updateDT'
    group by tag
    order by freq desc;
"

//改bug
sh xxxx.sh 20180507
//写入口
[deal]# touch a_main.sh
#! /bin/bash
# Entry point: loads today's data, then builds today's hot-tag ranking.

# Today's date as yyyymmdd. Command substitution runs `date`; single quotes would
# store the literal text instead of the command's output.
currentDT=$(date +%Y%m%d)
echo "currentDT=$currentDT"

# Load the text file into the app_tag_meta_info table.
echo "start load data to table process"
# Stop here on failure so the ranking is not built from a partition that failed to load.
sh produce_app_tag_meta_info.sh "$currentDT" || { echo "load data failed" >&2; exit 1; }
echo "end"

# Build the ranked hot-tag data.
echo "start insert tag rank data"
sh produce_hot_tag_rank.sh "$currentDT" || { echo "insert tag rank data failed" >&2; exit 1; }
echo "end"

echo "all done!"


//next
cp hot_tag_rank.sh hot_tag_rank_rcfile.sh

#! /bin/bash
# Creates the RCFile-backed variant of the hot-tag ranking table.

# Brings $HIVE into scope; without this line `$HIVE -e` expands to just `-e`.
source ../config/set_env.sh

db="job_002"
table_name="hot_tag_rank_rcfile"

# RCFile stores data column-wise within row groups. The note is kept outside the
# quoted SQL because `//` is not Hive comment syntax.
$HIVE -e "
    use $db;
    create table $table_name(
        tag string,
        freq int
    )
    partitioned by (dt string comment 'update date')
    stored as rcfile;
"

# desc hot_tag_rank_rcfile
# show create table hot_tag_rank_rcfile

[deal]# cp produce_hot_tag_rank.sh produce_hot_tag_rank_rc.sh

#! /bin/bash
# Counts tag frequencies for one dt partition of the input table and writes the
# ranking into the same dt partition of the RCFile output table.
# Usage: sh produce_hot_tag_rank_rc.sh <yyyymmdd>

# Brings $HIVE (path to the Hive CLI) into scope.
source ../config/set_env.sh

# $1 is the partition date to process (e.g. 20180507).
updateDT=$1
if [ -z "$updateDT" ]; then
    echo "usage: $0 <yyyymmdd>" >&2
    exit 1
fi

db="job_002"
# No spaces around '=': with spaces bash would try to run the name as a command.
output_table_name_1="hot_tag_rank_rcfile"
input_table_name_1="app_tag_meta_info"
# Presumably reserved for a UDF jar / class; not used by this script.
jar_path=""
class_path=""

# Notes are kept outside the quoted SQL: `//` is not Hive comment syntax, so text
# after it would be sent to Hive as part of the statement.
# The `set` line selects Tez as the execution engine.
$HIVE -e "
    use $db;
    set hive.execution.engine=tez;
    insert overwrite table $output_table_name_1 partition(dt='$updateDT')
    select tag, count(1) as freq
    from $input_table_name_1
    lateral view explode(split(tags,',')) tag_table as tag
    where tag != '' and tag != '-' and dt = '$updateDT'
    group by tag
    order by freq desc;
"

vi a_main.sh

# Build the ranked hot-tag data; this step calls the _rc variant of the ranking script.
echo "start insert tag rank data"
sh produce_hot_tag_rank_rc.sh $currentDT
echo "end"

//查看变化
//查找rcfile路径
[slave]$ hdfs dfs -ls 路径
出来路径之后在后面输入dt=20180507
出现/dt=20180507/000000_0
//再来一个不是rcfile的路径
再在路径后面输入dt=20180507
也是得到000000_0
//查看里面的内容
# hdfs dfs -text /xxxx/xxx/dt=20180507/* | more    （ls 输出中路径前的“空间大小”“时间”两列不是命令参数）

rcfile自带压缩。  所以rcfile空间大小要低一些
hive --service rcfilecat /xxxxx/000000_0   //查看里面的内容,但是里面会乱码
一般用select查看