1. Hive Basics

1.1 Creating Tables and Inserting Data

Creating a table

-- Create a new Hive table --
drop table if exists dev.dtable;
create table if not exists dev.dtable(
    datag_time        string   comment 'column comment',
    group_type        char(10) comment 'column comment',
    rationality_index double   comment 'column comment',
    deep_value_index  double   comment 'column comment'
)
comment 'table comment'
partitioned by (
    dt string comment 'partition : date'
)
row format delimited
fields terminated by '\001'
tblproperties (
    'author'='tian'
);

Inserting data

-- Insert data
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
INSERT INTO TABLE app.* PARTITION (dt)
select distinct
    *, *, *, *, *, dt
from *.*
where name is not NULL AND name like '%桌%' AND dt='2023-02-25'
LIMIT 2000000;

Loading local data into a Hive table:

-- Create a new Hive table: customers who left contact info and placed an order --
drop table if exists app.*;
create table if not exists app.*(
    touch_cus_time string comment 'time the customer was reached',
    customer_no    string comment 'customer C-code'
)
comment 'New Hive table: customers who left contact info and placed an order'
partitioned by (
    dt string comment 'partition : date'
)
row format delimited
fields terminated by ','
tblproperties (
    'author'=''
);

LOAD DATA LOCAL INPATH 'local_path.csv' OVERWRITE INTO TABLE app.app_xxxx PARTITION(dt='2023-05-23');

# Before loading, strip the Excel header row, and make sure the delimiter in "fields terminated by" matches the file (',' here; use '\t' for tab-separated exports).

1.2 Queries

Count the rows in a table:

select count(*) from table_name where dt='';

Get the system date:

SELECT sysdate(-2); -- (-2) means the day before the system date

1.3 Set Difference
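A common way to take the difference of two tables in Hive (rows present in one table but missing from the other) is a LEFT JOIN that keeps only the unmatched rows. A minimal sketch, assuming hypothetical tables table_a and table_b keyed by id and partitioned by dt:

SELECT a.*
FROM table_a AS a
LEFT JOIN table_b AS b
    ON a.id = b.id AND a.dt = b.dt     -- anti-join: keep rows of a with no match in b
WHERE b.id IS NULL
    AND a.dt = '2023-02-25';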

1.4 Join Statements

Joining two tables:

SELECT
    *
FROM
    table_name as ta
INNER JOIN table_name as tb
    ON ta.dt = tb.dt and ta.id = tb.id
WHERE ta.dt >= '2022-01-01';

1.5 Sorting

Partition-level sorting, global sorting, and sorting inside MapReduce; a sketch of the per-reducer form follows the ORDER BY example below.

https://blog.csdn.net/qq_43192537/article/details/102293995

Sort by a single column:

SELECT date, product, revenue
FROM sales
ORDER BY revenue DESC;
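ORDER BY produces a total order but funnels everything through a single reducer. For large tables, a hedged sketch of the per-reducer alternative, reusing the sales table from the example above:

SELECT date, product, revenue
FROM sales
DISTRIBUTE BY product   -- rows with the same product go to the same reducer
SORT BY revenue DESC;   -- sorted within each reducer only, not globally

CLUSTER BY product is shorthand for DISTRIBUTE BY plus SORT BY on the same column (ascending only).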

1.6 Changing a Column's Type

https://blog.csdn.net/glittledream/article/details/84789571

ALTER TABLE table_name CHANGE COLUMN old_column_name new_column_name data_type;
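A concrete sketch of the statement above, assuming a hypothetical table app.orders whose amount column should become DOUBLE while keeping its name:

-- only table metadata changes; existing data files are not rewritten
ALTER TABLE app.orders CHANGE COLUMN amount amount DOUBLE;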

1.7 UPDATE

https://my.oschina.net/u/2380815/blog/4453765

Hive did not support UPDATE and DELETE before version 0.14. From 0.14 on, a table must meet certain conditions (ORC storage, transactional/ACID properties, and so on) before UPDATE and DELETE can be used. For a traditional, non-transactional Hive table, an update has to be expressed as a rewrite in plain SQL.
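A minimal sketch of the rewrite-based update, assuming a hypothetical non-ACID table app.customer with columns id and status, where status should become 'closed' for id = 100:

INSERT OVERWRITE TABLE app.customer
SELECT
    id,
    CASE WHEN id = 100 THEN 'closed' ELSE status END AS status   -- rewrite every row, changing only the target rows
FROM app.customer;

For a partitioned table, overwrite only the affected partition (INSERT OVERWRITE TABLE ... PARTITION (dt='...')) instead of rewriting the whole table.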

1.8 Handling NULLs

https://blog.csdn.net/weixin_30416497/article/details/97950750

WHERE column_name IS NULL
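Besides filtering NULL rows, a common need is to replace NULL with a default value when selecting. A small sketch using the built-in coalesce function, assuming a hypothetical column sku_name:

SELECT coalesce(sku_name, 'unknown') AS sku_name   -- nvl(sku_name, 'unknown') is equivalent for two arguments
FROM table_name
WHERE dt = '2023-02-25';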

1.9 WHERE

Inequality conditions:

SELECT item_name, brand_name_full
FROM table_name
WHERE dt >= "2021-05-25" and brand_name_full <> "NO BRAND"
LIMIT 3000;

1.10 Deduplication / Fuzzy Matching

(1) Remove duplicates with a fuzzy (LIKE) filter

select
    distinct sku_name
from table_name
where sku_name is not NULL AND sku_name like '%桌%' AND dt='2023-02-25';

(2) Count the number of distinct values

select count(distinct AccountID) from CharacterLogin where day="27" and month="10";

(3) Deduplicate on a single column. DISTINCT can only prefix the whole select list, so when several columns are selected it cannot deduplicate just one of them; use row_number() instead. The query below keeps one row per sku_name:

SELECT dropre.sku_name,
       dropre.m1,
       dropre.m2,
       dropre.m3,
       dropre.mm
FROM (
    SELECT aa.sku_name,
           aa.m1,
           aa.m2,
           aa.m3,
           aa.mm,
           row_number() OVER (PARTITION BY aa.sku_name ORDER BY aa.mm) AS rn
    FROM app.app_*** AS aa
) AS dropre
WHERE dropre.rn = 1;

https://www.cnblogs.com/rrttp/p/9026359.html

1.11 Keeping the First Number in a String
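One way to keep only the first number that appears in a string column is regexp_extract. A hedged sketch, assuming a hypothetical column sku_name containing text such as '折叠桌 120cm x 60cm':

SELECT sku_name,
       regexp_extract(sku_name, '([0-9]+)', 1) AS first_number   -- first capture group = first run of digits
FROM table_name
WHERE dt = '2023-02-25';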

1.12 Combined Queries over Two Tables

https://www.cnblogs.com/likai198981/archive/2013/03/29/2989740.html

1.13 Counting Occurrences of a Column Value

https://blog.csdn.net/love_java_cc/article/details/52234889
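A minimal sketch of counting how many times each value of a column appears, assuming a hypothetical column sku_name:

SELECT sku_name,
       count(*) AS cnt
FROM table_name
WHERE dt = '2023-02-25'
GROUP BY sku_name
ORDER BY cnt DESC;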

1.14 Running a SQL Script

hive -f t.sql

1.15 Exporting Data to a Text File

bin/hive -f sql.q >> res.csv
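Redirecting hive -f output works for small result sets; Hive can also write results straight to a local directory. A hedged sketch, assuming a hypothetical target directory /tmp/export:

INSERT OVERWRITE LOCAL DIRECTORY '/tmp/export'
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
SELECT sku_name, count(*) AS cnt
FROM table_name
WHERE dt = '2023-02-25'
GROUP BY sku_name;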

1.16 String Concatenation

CONCAT(dt,"-01")

concat(col1,col2)

# After GROUP BY, store all values of a group as one concatenated string
concat_ws('_',collect_set(name))
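A small usage sketch of the GROUP BY pattern above, assuming hypothetical columns group_id and name: collect_set gathers the distinct name values of each group, and concat_ws joins them with underscores.

SELECT group_id,
       concat_ws('_', collect_set(name)) AS names
FROM table_name
GROUP BY group_id;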

1.17 Building a Derived Table from a Shell Script

#!/bin/bash

start_date="20230101"
end_date="20230501"

# Alternative: loop over an explicit list of months, e.g.
#   for time_date in '2023-05-01' '2023-04-01' ... '2022-01-01'; do run_write $time_date; done

function run_write(){
    time_date=$1
    echo "$time_date"
    set_dt_d=$time_date
    # Window boundaries relative to the partition month (GNU date arithmetic)
    b_threem=$(date -d "${time_date} -3 months" +%Y-%m-01)
    b_onem=$(date -d "${time_date} -1 months" +%Y-%m-01)
    thism=$(date -d "${time_date} +1 months" +%Y-%m-01)
    yy_ms=$(date -d "${time_date} -12 months" +%Y-%m-01)
    yy_me=$(date -d "${time_date} -11 months" +%Y-%m-01)
    yy_nextms=$(date -d "${time_date} -11 months" +%Y-%m-01)
    yy_nextme=$(date -d "${time_date} -9 months" +%Y-%m-01)
    retain_s=$(date -d "${yy_ms}" +%Y-01-01)
    retain_e=$(date -d "${retain_s} +12 months" +%Y-%m-01)
    ty_his=$(date -d "${time_date}" +%Y-01-01)

    hive -e "
    set hive.exec.dynamic.partition=true;
    set hive.exec.dynamic.partition.mode=nonstrict;
    INSERT INTO TABLE app.target_table PARTITION (dt)
    SELECT
        customer_no,
        SUM(IF(dt >= '$b_threem'  AND dt < '$set_dt_d',  total_amount, 0)) AS b_threem_amount,
        SUM(IF(dt >= '$b_onem'    AND dt < '$set_dt_d',  total_amount, 0)) AS b_onem_amount,
        SUM(IF(dt >= '$set_dt_d'  AND dt < '$thism',     total_amount, 0)) AS thism_amount,
        SUM(IF(dt >= '$yy_ms'     AND dt < '$yy_me',     total_amount, 0)) AS yy_m_amount,
        SUM(IF(dt >= '$yy_nextms' AND dt < '$yy_nextme', total_amount, 0)) AS yy_nextm_amount,
        SUM(IF(dt >= '$retain_s'  AND dt < '$retain_e',  total_amount, 0)) AS retain_amount,
        SUM(IF(dt >= '$ty_his'    AND dt < '$set_dt_d',  total_amount, 0)) AS ty_his_amount,
        '$set_dt_d' AS dt
    FROM
    (
        SELECT
            id AS customer_no,
            SUM(total_amount) AS total_amount,
            CONCAT(dt, '-01') AS dt
        FROM app.source_table
        GROUP BY id, dt
    ) amount_data
    GROUP BY customer_no
    "
}

while [[ "$start_date" -le "$end_date" ]]
do
    stat_date_month=$(date -d "${start_date}" +%Y-%m-%d)
    echo "$stat_date_month"
    run_write "$stat_date_month"
    start_date=$(date -d "${start_date} +1 months" +%Y%m%d)
done

1.18 RANK

rank() OVER (PARTITION BY col1 ORDER BY col2, col3 DESC) AS priority
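A concrete sketch ranking SKUs by price within each category, assuming hypothetical columns cate_name and price:

SELECT sku_name,
       cate_name,
       price,
       rank() OVER (PARTITION BY cate_name ORDER BY price DESC) AS priority   -- ties share a rank and leave gaps; dense_rank()/row_number() behave differently
FROM table_name
WHERE dt = '2023-02-25';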

2. Moving Data Between Hive and ES

Insert data into the Hive source table

set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
INSERT INTO TABLE app.app_*_* PARTITION (dt)
select distinct
    sku_name, item_first*, item_second_*, item_third_*, jd_*, dt
from xxx.xxx_*
where sku_name is not NULL AND dt='2023-02-25';

Add the JAR file

-- add the elasticsearch-hadoop jar
add jar hdfs://xxxx/xxx/xx/elasticsearch-hadoop-hive-8.6.2.jar;

Create the ES-backed external table in Hive

-- create the ES connection table from Hive
DROP TABLE IF EXISTS xxx.xxx_hive_es;
CREATE EXTERNAL TABLE xxx.xxx_hive_es(
    sku_name string,
    xx string,
    xx string,
    xx string,
    prc float
)
STORED BY 'org.elasticsearch.hadoop.hive.EsStorageHandler'
TBLPROPERTIES(
    'es.resource'='index_name',
    'es.nodes'='esxxx.xxx-xxx-hb.xxx.com',
    'es.port'='xxx',
    'es.mapping.name' = 'sku_xxx:sku_xxx, item_first_cate_xxx:item_first_cate_xxx',
    'es.mapping.id' = 'sku_xxx',
    'es.write.operation'='upsert',
    'es.index.auto.create'='TRUE',
    'es.field.read.empty.as.null' ='TRUE',
    'es.net.http.auth.user'='xxx',
    'es.net.http.auth.pass'='=xxxx+1'
);

Insert data into ES

INSERT OVERWRITE TABLE xxx.xxx_hive_es
SELECT dropre.sku_xxx,
       dropre.item_xxx,
       dropre.item_xxx,
       dropre.item_xxx,
       dropre.xxprc
FROM (
    SELECT aa.sku_xxx,
           aa.item_xxx,
           aa.item_xxx,
           aa.item_xxx,
           aa.xxprc,
           aa.dt,
           row_number() OVER (PARTITION BY aa.sku_xxx ORDER BY aa.xxprc) AS rn
    FROM app.app_*_* AS aa
    WHERE aa.dt='2023-02-25'
) AS dropre
WHERE dropre.rn = 1 AND dropre.sku_xxx <> '';

3. Hive Date and Time Handling

Truncate a date to the start of the year:

SELECT trunc('2021-12-17','YYYY');

Returns 2021-01-01.

Truncate a date to the start of the month:

SELECT trunc(current_date(),'MM');

Returns the first day of the current month (for example, 2021-12-01 when run on 2021-12-17).

First day of the previous month:

SELECT trunc(add_months(CURRENT_TIMESTAMP,-1),'MM');

Subtract one month from a date:

SELECT add_months(CURRENT_DATE,-1);

Current date:

SELECT CURRENT_DATE;

The first months of last year (January, February, March):

SELECT
    date_add(concat(year(current_date) - 1, '-01-01'), 0) AS january,
    date_add(concat(year(current_date) - 1, '-02-01'), 0) AS february,
    date_add(concat(year(current_date) - 1, '-03-01'), 0) AS march
FROM your_table;

References

https://stackoverflow.com/questions/28674753/hive-runtime-error-while-processing-row-in-hive (vectorization parameter tuning)
https://codeantenna.com/a/BD4HSLCWqZ (deduplicating on a single column)
https://blog.csdn.net/weixin_42303955/article/details/113076713 (writing Hive jobs in shell scripts)
https://blog.csdn.net/jiegedalao/article/details/110855846 (for-loop reference)
