标签体系改造
Enoch

标签体系ES存储方案(es v5)

设置分词器

1
2
3
4
5
6
7
8
9
10
11
12
{
"settings": {
"analysis": {
"analyzer": {
"vertical": {
"type": "pattern",
"pattern": "\\|"
}
}
}
}
}

index

每个treeType为一个index,名称为trident_es_{treeTypeId}

type

info

对应打标签对象的物料表

label

存储标签

info和label为父子文档

schema

info

和物料表字段类型保持一致,string类型的都为keyword

label

label中有多个properties,设置索引模板

每个标签树对应三个字段:tree_{treeId}_label,tree_{treeId}_label_expire_time,tree_{treeId}_label_update_time,每个字段设置类型为text,设置分词器为\|

字段中存储多个标签,每个标签用|分割

1
2
3
4
5
6
7
{
"_source":{
"tree_1_label":"1_男|2_四川|3_中国",
"tree_1_label_expire_time":"1_2100-01-01T00:00:00+08:00|2_2100-01-01T00:00:00+08:00|3_2100-01-01T00:00:00+08:00",
"tree_1_label_update_time":"1_2100-01-01T00:00:00+08:00|2_2100-01-01T00:00:00+08:00|3_2100-01-01T00:00:00+08:00"
}
}

涉及改动

写入

原整体写入流程

  • 根据标签规则设置,生成相应的打标签sql(基于hive)
  • 将标签结果写入到hive中的all_temp_tags里(竖表,分区表)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
-- Vertical ("long") label table: one row per entity and label value,
-- partitioned by entity type, label date and label id.
CREATE TABLE db_datastory_trident.all_temp_tags (
    entity_id STRING COMMENT '实体ID',
    label_val STRING COMMENT '标签值',
    expire_time TIMESTAMP COMMENT '过期时间,默认为unix_timestamp()',
    create_time TIMESTAMP COMMENT '创建时间,默认为unix_timestamp()'
)
PARTITIONED BY (
    entity_type INT,
    label_date DATE,
    label_id INT
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '\001'
STORED AS TEXTFILE;

-- Sample tagging statement: every user whose province is '四川省' receives
-- label 3275. '#{#cur_date}' is a template placeholder substituted before the
-- statement is submitted. 4102416000 is epoch seconds for
-- 2100-01-01T00:00:00+08:00, i.e. the label effectively never expires.
INSERT OVERWRITE TABLE db_datastory_trident.all_temp_tags
PARTITION (entity_type = 543, label_date = '#{#cur_date}', label_id = 3275)
SELECT
    u.user_id AS entity_id,
    '覆盖四川人群' AS label_val,
    FROM_UNIXTIME(4102416000) AS expire_time,
    FROM_UNIXTIME(UNIX_TIMESTAMP()) AS create_time
FROM tdm_cdp.tdm_user_91510124MA61X5CT68 AS u
WHERE u.user_id IS NOT NULL
  AND u.`province` = '四川省';

  • 将结果竖表转成横表,刷到hbase和es中
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
-- Original per-label sync: copy one label's rows from the vertical table into
-- HBase, then from the HBase-mapped table into Elasticsearch.
-- Comment lines use "--" because "##" is not a HiveQL comment marker.

-- Temp-table naming pattern: hbase_label_{treeTypeId}_temp_{labelId}
DROP TABLE IF EXISTS db_datastory_trident.hbase_label_543_temp_3275;

-- Hive external table mapped onto HBase table db_datastory_trident.hbase_label_543:
-- user_id is the row key, the label columns live in column family "labels".
CREATE EXTERNAL TABLE db_datastory_trident.hbase_label_543_temp_3275 (
    user_id string,
    label_3275 string,
    label_3275_update_time TIMESTAMP,
    label_3275_expire_time TIMESTAMP
)
STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
WITH SERDEPROPERTIES (
    'hbase.columns.mapping' = ':key,labels:label_3275,labels:label_3275_update_time,labels:label_3275_expire_time'
)
TBLPROPERTIES (
    'hbase.mapred.output.outputtable' = 'db_datastory_trident.hbase_label_543',
    'hbase.table.name' = 'db_datastory_trident.hbase_label_543'
);

-- Columns are matched by position: create_time lands in label_3275_update_time.
INSERT INTO TABLE db_datastory_trident.hbase_label_543_temp_3275
SELECT
    entity_id,
    label_val,
    create_time,
    expire_time
FROM db_datastory_trident.all_temp_tags
WHERE entity_type = 543
  AND label_date = '#{#cur_date}'
  AND label_id = 3275;

-- Temp-table naming pattern: tmp_trident_es_{treeTypeId}_label_{labelId}
DROP TABLE IF EXISTS db_datastory_trident.tmp_trident_es_543_label_3275;

-- Hive external table backed by ES resource trident_es_543/label_default.
-- Rows are upserted with user_id as document id, parent and routing key.
CREATE EXTERNAL TABLE db_datastory_trident.tmp_trident_es_543_label_3275 (
    user_id string,
    label_3275 string,
    label_3275_update_time TIMESTAMP,
    label_3275_expire_time TIMESTAMP
)
STORED BY 'org.elasticsearch.hadoop.hive.EsStorageHandler'
TBLPROPERTIES (
    "es.nodes" = "xdp-prod-es1:9200",
    "es.write.operation" = "upsert",
    "es.update.retry.on.conflict" = "10",
    "es.resource" = "trident_es_543/label_default",
    "es.mapping.id" = "user_id",
    "es.mapping.parent" = "user_id",
    "es.mapping.routing" = "user_id",
    "es.mapping.names" = "user_id:user_id,label_3275:label_3275,label_3275_update_time:label_3275_update_time,label_3275_expire_time:label_3275_expire_time"
);

-- Push the label columns from the HBase-mapped table into Elasticsearch.
INSERT INTO TABLE db_datastory_trident.tmp_trident_es_543_label_3275
SELECT
    user_id,
    label_3275,
    label_3275_update_time,
    label_3275_expire_time
FROM db_datastory_trident.hbase_label_543_temp_3275;

-- Remove both Hive mapping tables once the sync is done.
DROP TABLE IF EXISTS db_datastory_trident.hbase_label_543_temp_3275;
DROP TABLE IF EXISTS db_datastory_trident.tmp_trident_es_543_label_3275;

改动后的写入流程

  • 根据标签规则设置,生成相应的打标签sql(基于hive) ==> 不变
  • 将标签结果写入到hive中的all_temp_tags里(竖表,分区表) ==> 分区增加tree_id
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
-- Vertical ("long") label table after the redesign: same columns as before,
-- with tree_id added as a partition key between entity_type and label_date.
CREATE TABLE db_datastory_trident.all_temp_tags (
    entity_id STRING COMMENT '实体ID',
    label_val STRING COMMENT '标签值',
    expire_time TIMESTAMP COMMENT '过期时间,默认为unix_timestamp()',
    create_time TIMESTAMP COMMENT '创建时间,默认为unix_timestamp()'
)
PARTITIONED BY (
    entity_type INT,
    tree_id INT,
    label_date DATE,
    label_id INT
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '\001'
STORED AS TEXTFILE;

-- Sample tagging statement: users whose province is '四川省' receive label 3275
-- of tree 1. '#{#cur_date}' is a template placeholder substituted before the
-- statement is submitted. 4102416000 is epoch seconds for
-- 2100-01-01T00:00:00+08:00, i.e. the label effectively never expires.
INSERT OVERWRITE TABLE db_datastory_trident.all_temp_tags
PARTITION (entity_type = 543, tree_id = 1, label_date = '#{#cur_date}', label_id = 3275)
SELECT
    u.user_id AS entity_id,
    '覆盖四川人群' AS label_val,
    FROM_UNIXTIME(4102416000) AS expire_time,
    FROM_UNIXTIME(UNIX_TIMESTAMP()) AS create_time
FROM tdm_cdp.tdm_user_91510124MA61X5CT68 AS u
WHERE u.user_id IS NOT NULL
  AND u.`province` = '四川省';

  • 将结果竖表转成横表,刷到hbase和es中 ==> 修改逻辑,将多字段拼接成字符串
    跑单个标签时,也走这种逻辑,性能问题?
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
-- Tree-level sync: collapse all labels of one tree into three '|'-joined
-- strings per entity, write them to HBase, then push them to Elasticsearch.
-- Comment lines use "--" because "#" / "##" are not HiveQL comment markers.

-- Naming pattern from the design notes: hbase_label_{treeTypeId}_{treeId}_temp
-- (this example table additionally carries a _3275 suffix).
DROP TABLE IF EXISTS db_datastory_trident.hbase_label_543_1_temp_3275;

-- When a new label tree is added, the HBase table must be altered to add the
-- tree's three columns (label, update time, expire time).
CREATE EXTERNAL TABLE db_datastory_trident.hbase_label_543_1_temp_3275 (
    user_id string,
    tree_1_label string,
    tree_1_label_update_time string,
    tree_1_label_expire_time string
)
STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
WITH SERDEPROPERTIES (
    'hbase.columns.mapping' = ':key,labels:tree_1_label,labels:tree_1_label_update_time,labels:tree_1_label_expire_time'
)
TBLPROPERTIES (
    'hbase.mapred.output.outputtable' = 'db_datastory_trident.hbase_label_543',
    'hbase.table.name' = 'db_datastory_trident.hbase_label_543'
);

-- One output row per entity. Every element is prefixed with its label_id, so
-- the three strings can be matched up by id even if the aggregates return
-- their elements in different orders. Columns are matched by position:
-- create_time feeds tree_1_label_update_time.
INSERT INTO TABLE db_datastory_trident.hbase_label_543_1_temp_3275
SELECT
    entity_id,
    CONCAT_WS('|', collect_set(concat(label_id, "_", label_val))),
    CONCAT_WS('|', collect_set(concat(label_id, "_", create_time))),
    CONCAT_WS('|', collect_set(concat(label_id, "_", expire_time)))
FROM db_datastory_trident.all_temp_tags
WHERE entity_type = 543
  AND tree_id = 1
  AND label_date = '#{#cur_date}'
GROUP BY entity_id;

-- Sample rows (entity_id, then the tree_1_label string):
-- 00b2d95622fd49c1ba79064ac09842df 1955_标签测试_WSK|1956_teset|1957_DDD|1953_xdd_标签任务1|1970_标签测试_WSK12
-- 00b4e84465b8473c9cecb4cf35c41177 1955_标签测试_WSK|1956_teset|1970_标签测试_WSK12
-- 00b91497d78e4eefa45cfd87fc3f319d 1955_标签测试_WSK|1956_teset|1957_DDD|1953_xdd_标签任务1|1970_标签测试_WSK12

-- Naming pattern from the design notes: tmp_trident_es_{treeTypeId}_{treeId}_label_{labelId}
DROP TABLE IF EXISTS db_datastory_trident.tmp_trident_es_543_1_label_3275;

-- Hive external table backed by ES resource trident_es_543/label_default.
-- Rows are upserted with user_id as document id, parent and routing key.
CREATE EXTERNAL TABLE db_datastory_trident.tmp_trident_es_543_1_label_3275 (
    user_id string,
    tree_1_label string,
    tree_1_label_update_time string,
    tree_1_label_expire_time string
)
STORED BY 'org.elasticsearch.hadoop.hive.EsStorageHandler'
TBLPROPERTIES (
    "es.nodes" = "xdp-prod-es1:9200",
    "es.write.operation" = "upsert",
    "es.update.retry.on.conflict" = "10",
    "es.resource" = "trident_es_543/label_default",
    "es.mapping.id" = "user_id",
    "es.mapping.parent" = "user_id",
    "es.mapping.routing" = "user_id",
    "es.mapping.names" = "user_id:user_id,tree_1_label:tree_1_label,tree_1_label_update_time:tree_1_label_update_time,tree_1_label_expire_time:tree_1_label_expire_time"
);

-- Source is the tree-level HBase mapping table created in this script; the
-- per-label table hbase_label_543_temp_3275 is not created in this flow and
-- has no tree_1_* columns.
INSERT INTO TABLE db_datastory_trident.tmp_trident_es_543_1_label_3275
SELECT
    user_id,
    tree_1_label,
    tree_1_label_update_time,
    tree_1_label_expire_time
FROM db_datastory_trident.hbase_label_543_1_temp_3275;

-- Remove both Hive mapping tables once the sync is done.
DROP TABLE IF EXISTS db_datastory_trident.hbase_label_543_1_temp_3275;
DROP TABLE IF EXISTS db_datastory_trident.tmp_trident_es_543_1_label_3275;

强制要求标签名中不能有“|”

merge相关

修改

改后

删除

改后

查询

改后

人群包

修改相应的查询,仍然是has_child+query,query中采用bool+match的形式
涉及到标签过期的话,采用script?

待确认

1
2
3
4
5
6
7
8
{
"must": [
"1##match(内容,中国)",
"2##match(内容,贸易战)"
],
"mustNot": [],
"boolExpression": "1 OR 2"
}
  • 更新时的性能问题
    更新某个标签时,因为是字符串,所以不能直接update,需要用script的方式去更新。将值取出来,split,找到对应的标签,update取值

数组方案宣告失败。参考官网说明

方案2 数组

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
PUT test2/label_default/19b2a580d86946a7b886ac2e79b581f2/_create
{
"tree_2":[
"1641,客户有下单,1610990713000,4102416000000",
"1953,xdd_标签任务1,1610990713000,4102416000000"
]
}

PUT test2/label_default/1a0ec7855540498696a1ed50a16e1228/_create
{
"tree_2":[
"1641,客户有下单,1610990713000,4102416000000" ]
}

## Query-driven update. The single-document _update endpoint needs a document id
## in the path and does not accept "query", so this uses _update_by_query.
## Each entry is "labelId,labelName,updateTime,expireTime"; the script replaces the
## third field of every entry in each matched document, as the original loop did.
## It uses indexOf/substring because arrays have no join() method; entries with
## fewer than three commas are left untouched.
## NOTE(review): not executed against a cluster - confirm on the target ES 5.x version.
POST test2/label_default/_update_by_query
{
  "query": {
    "bool": {
      "must": [
        {
          "match": {
            "tree_2": "xdd_标签任务1"
          }
        }
      ]
    }
  },
  "script": {
    "source": "for (int i = 0; i < ctx._source.tree_2.size(); i++) { String entry = ctx._source.tree_2[i]; int second = entry.indexOf(',', entry.indexOf(',') + 1); int third = entry.indexOf(',', second + 1); if (second > 0 && third > second) { ctx._source.tree_2[i] = entry.substring(0, second + 1) + params.updateTime + entry.substring(third); } }",
    "params": {
      "updateTime": 1611115065000
    },
    "lang": "painless"
  }
}

方案三 nested

1
2


 评论
评论插件加载失败
正在加载评论插件
由 Hexo 驱动 & 主题 Keep
访客数 访问量