--如果数据库已存在就删除 -- cascade 表示级联删除,即同时删除依赖于该数据库的所有对象,如表、视图、函数等。这也是一个可选的语法,但建议在需要完全清除数据库时使用。 drop database if exists db_momo cascade; --创建数据库 create database db_momo; --切换数据库 use db_momo; --列举数据库 show databases;
需求4:将ETL以后的结果保存到一张新的Hive表中 Create table …… as select ……
ETL实现
1 2 3 4 5 6 7 8 9
--如果表已存在就删除 droptable if exists tb_msg_etl; --将Select语句的结果保存到新表中 createtable tb_msg_etl as select*,substr(msg_time,0,10) as dayinfo, substr(msg_time,12,2) as hourinfo, --获取天和小时 split(sender_gps,",")[0] as sender_lng, split(sender_gps,",")[1] as sender_lat --提取经度纬度 from tb_msg_source --过滤字段为空的数据 where length(sender_gps) >0;
查看结果
1
select msg_time,dayinfo,hourinfo,sender_gps,sender_lng,sender_lat from tb_msg_etl limit 10;
--保存结果表 createtable if notexists tb_rs_total_msg_cnt comment "今日消息总量" as select dayinfo,count(*) as total_msg_cnt from tb_msg_etl groupby dayinfo;
select*from tb_rs_total_msg_cnt;
统计每小时消息量、发送和接收用户数
1 2 3 4 5 6 7
--保存结果表 createtable if notexists tb_rs_hour_msg_cnt comment "每小时消息量趋势" as select dayinfo,hourinfo,count(*) as total_msg_cnt,count(distinct sender_account) as sender_usr_cnt,count(distinct receiver_account) as receiver_usr_cnt from tb_msg_etl groupby dayinfo,hourinfo;
select*from tb_rs_hour_msg_cnt;
统计今日各地区发送消息总量
1 2 3 4 5 6 7
--保存结果表 createtable if notexists tb_rs_loc_cnt comment "今日各地区发送消息总量" as select dayinfo,sender_gps,cast(sender_lng asdouble) as longitude,cast(sender_lat asdouble) as latitude,count(*) as total_msg_cnt from tb_msg_etl groupby dayinfo,sender_gps,sender_lng,sender_lat;
select*from tb_rs_loc_cnt;
统计今日发送和接收用户人数
1 2 3 4 5 6 7
--保存结果表 createtable if notexists tb_rs_usr_cnt comment "今日发送消息人数、接受消息人数" as select dayinfo,count(distinct sender_account) as sender_usr_cnt,count(distinct receiver_account) as receiver_usr_cnt from tb_msg_etl groupby dayinfo;
select*from tb_rs_usr_cnt;
统计发送消息条数最多的Top10用户
1 2 3 4 5 6 7
--保存结果表 createtable if notexists tb_rs_susr_top10 comment "发送消息条数最多的Top10用户" as select dayinfo,sender_name as username,count(*) as sender_msg_cnt from tb_msg_etl groupby dayinfo,sender_name orderby sender_msg_cnt desc limit 10;
select*from tb_rs_susr_top10;
统计接收消息条数最多的Top10用户
1 2 3 4 5 6 7
--保存结果表 createtable if notexists tb_rs_rusr_top10 comment "接受消息条数最多的Top10用户" as select dayinfo,receiver_name as username,count(*) as receiver_msg_cnt from tb_msg_etl groupby dayinfo,receiver_name orderby receiver_msg_cnt desc limit 10;
select*from tb_rs_rusr_top10;
统计发送人的手机型号分布情况
1 2 3 4 5 6 7
--保存结果表 createtable if notexists tb_rs_sender_phone comment "发送人的手机型号分布" as select dayinfo,sender_phonetype,count(distinct sender_account) as cnt from tb_msg_etl groupby dayinfo,sender_phonetype;
select*from tb_rs_sender_phone limit 10;
统计发送人的操作系统分布
1 2 3 4 5 6 7
--保存结果表 createtable if notexists tb_rs_sender_os comment "发送人的OS分布" as select dayinfo,sender_os,count(distinct sender_account) as cnt from tb_msg_etl groupby dayinfo,sender_os;