php elasticsearch聚合查询实践

概念

聚合分类聚合语法聚合作用范围及排序聚合原理及 terms 精准度聚合实验

桶聚合指标聚合Pipeline 聚合实践一：多商户数据权限聚合分页实践二：多维度嵌套聚合实践三：删除 ES 索引重复数据附：实验环境

概念

用于聚合的字段必须是 exact value，即doc_value=true。分词字段不可进行聚合，对于 text 字段如需使用聚合，需开启 fielddata，不推荐因容易造成 OOM。

聚合分类

Bucket aggregations（桶聚合） Metric aggregations（指标聚合） Pipeline aggregations（管道聚合）

聚合语法

request

GET /my-index/_search

{

"aggs": {

"my-agg-name": {

"terms": {

"field": "my-field"

}

response

{

"took": 78,

"timed_out": false,

"_shards": {

"total": 1,

"successful": 1,

"skipped": 0,

"failed": 0

"hits": {

"total": {

"value": 5,

"relation": "eq"

"max_score": 1.0,

"hits": [...]

"aggregations": {

"my-agg-name": {

"doc_count_error_upper_bound": 0,

"sum_other_doc_count": 0,

"buckets": []

}

聚合作用范围及排序

query 和 filter，是先选定数据范围，再聚合桶； post_filter 对聚合桶没影响，桶全部返回，只对查询结果进行过滤返回，功能类似 mysql 中的 having； global 的作用是覆盖掉 query 的查询作用。

聚合原理及 terms 精准度

Terms Aggregation 的返回中有两个特殊的数值

doc_count_error_upper_bound：被遗漏的 term 分桶，包含的文档，有可能的最大值 sum_other_doc_count：除了返回结果 bucket 的 terms 以外，其他的 terms 的文档总数（总数-返回的总数）

聚合实验

“

实验数据引用自《Elasticsearch 核心技术与实战》- 阮一鸣（eBay Pronto 平台技术负责人）

创建索引

PUT /employees/

{

"mappings" : {

"properties" : {

"age" : {

"type" : "integer"

"gender" : {

"type" : "keyword"

"job" : {

"type" : "text",

"fields" : {

"keyword" : {

"type" : "keyword",

"ignore_above" : 50

}

"name" : {

"type" : "keyword"

"salary" : {

"type" : "integer"

}

批量写入数据

PUT /employees/_bulk

{ "index" : { "_id" : "1" } }

{ "name" : "Emma","age":32,"job":"Product Manager","gender":"female","salary":35000 }

{ "index" : { "_id" : "2" } }

{ "name" : "Underwood","age":41,"job":"Dev Manager","gender":"male","salary": 50000}

{ "index" : { "_id" : "3" } }

{ "name" : "Tran","age":25,"job":"Web Designer","gender":"male","salary":18000 }

{ "index" : { "_id" : "4" } }

{ "name" : "Rivera","age":26,"job":"Web Designer","gender":"female","salary": 22000}

{ "index" : { "_id" : "5" } }

{ "name" : "Rose","age":25,"job":"QA","gender":"female","salary":18000 }

{ "index" : { "_id" : "6" } }

{ "name" : "Lucy","age":31,"job":"QA","gender":"female","salary": 25000}

{ "index" : { "_id" : "7" } }

{ "name" : "Byrd","age":27,"job":"QA","gender":"male","salary":20000 }

{ "index" : { "_id" : "8" } }

{ "name" : "Foster","age":27,"job":"Java Programmer","gender":"male","salary": 20000}

{ "index" : { "_id" : "9" } }

{ "name" : "Gregory","age":32,"job":"Java Programmer","gender":"male","salary":22000 }

{ "index" : { "_id" : "10" } }

{ "name" : "Bryant","age":20,"job":"Java Programmer","gender":"male","salary": 9000}

{ "index" : { "_id" : "11" } }

{ "name" : "Jenny","age":36,"job":"Java Programmer","gender":"female","salary":38000 }

{ "index" : { "_id" : "12" } }

{ "name" : "Mcdonald","age":31,"job":"Java Programmer","gender":"male","salary": 32000}

{ "index" : { "_id" : "13" } }

{ "name" : "Jonthna","age":30,"job":"Java Programmer","gender":"female","salary":30000 }

{ "index" : { "_id" : "14" } }

{ "name" : "Marshall","age":32,"job":"Javascript Programmer","gender":"male","salary": 25000}

{ "index" : { "_id" : "15" } }

{ "name" : "King","age":33,"job":"Java Programmer","gender":"male","salary":28000 }

{ "index" : { "_id" : "16" } }

{ "name" : "Mccarthy","age":21,"job":"Javascript Programmer","gender":"male","salary": 16000}

{ "index" : { "_id" : "17" } }

{ "name" : "Goodwin","age":25,"job":"Javascript Programmer","gender":"male","salary": 16000}

{ "index" : { "_id" : "18" } }

{ "name" : "Catherine","age":29,"job":"Javascript Programmer","gender":"female","salary": 20000}

{ "index" : { "_id" : "19" } }

{ "name" : "Boone","age":30,"job":"DBA","gender":"male","salary": 30000}

{ "index" : { "_id" : "20" } }

{ "name" : "Kathy","age":29,"job":"DBA","gender":"female","salary": 20000}

桶聚合

对 keword 进行聚合

GET employees/_search

{

"size": 0,

"aggs": {

"jobs": {

"terms": {

"field":"job.keyword"

}

指标聚合

多个 Metric 聚合，找到最低最高和平均 salary

GET employees/_search

{

"size": 0,

"aggs": {

"max_salary": {

"max": {

"field": "salary"

}

"min_salary": {

"min": {

"field": "salary"

}

"avg_salary": {

"avg": {

"field": "salary"

}

多次嵌套，根据工作类型分桶，然后按照性别分桶，计算 salary 的统计信息

GET employees/_search

{

"size": 0,

"aggs": {

"job_gender_stats": {

"terms": {

"field": "job.keyword"

"aggs": {

"gender_stats": {

"terms": {

"field": "gender"

"aggs": {

"salary_stats": {

"stats": {

"field": "salary"

}

response

{

"took" : 6,

"timed_out" : false,

"_shards" : {

"total" : 1,

"successful" : 1,

"skipped" : 0,

"failed" : 0

"hits" : {

"total" : {

"value" : 20,

"relation" : "eq"

"max_score" : null,

"hits" : [ ]

"aggregations" : {

"job_gender_stats" : {

"doc_count_error_upper_bound" : 0,

"sum_other_doc_count" : 0,

"buckets" : [

{

"key" : "Java Programmer",

"doc_count" : 7,

"gender_stats" : {

"doc_count_error_upper_bound" : 0,

"sum_other_doc_count" : 0,

"buckets" : [

{

"key" : "male",

"doc_count" : 5,

"salary_stats" : {

"count" : 5,

"min" : 9000.0,

"max" : 32000.0,

"avg" : 22200.0,

"sum" : 111000.0

}

{

"key" : "female",

"doc_count" : 2,

"salary_stats" : {

"count" : 2,

"min" : 30000.0,

"max" : 38000.0,

"avg" : 34000.0,

"sum" : 68000.0

}

]

}

......

]

}

Pipeline 聚合

平均 salary 的统计分析

GET employees/_search

{

"size": 0,

"aggs": {

"jobs": {

"terms": {

"field": "job.keyword",

"size": 10

"aggs": {

"avg_salary": {

"avg": {

"field": "salary"

}

"stats_salary_by_job":{

"stats_bucket": {

"buckets_path": "jobs>avg_salary"

}

实践一：多商户数据权限聚合分页

collapse + cardinality 实现分页去重查询

GET my_order/_search

{

"from": 0,

"size": 6,

"track_total_hits": true,

"query": {

"bool": {

"must": [

{

"terms": {

"tenant_id": [

]

}

]

}

"aggs": {

"cidAgg": {

"cardinality": {

"field": "cid"

}

"collapse": {

"field": "cid"

}

注：不支持 search_after，导出推荐 scroll

实践二：多维度嵌套聚合

date_histogram 日期直方图 + terms 分桶聚合过去一周每天产生的工单量，每天各品类工单量，每天各品类排名前 N 的爆品等等。

GET my_order/_search

{

"size": 0,

"query": {

"bool": {

"filter": [

{

"range": {

"created_at": {

"gte": "2023-11-10",

"lte": "2023-11-16"

}

]

}

"aggs": {

"ranges": {

"date_histogram": {

"field": "created_at",

"format": "yyyy-MM-dd",

"calendar_interval": "day"

"aggs": {

"order_type_agg": {

"terms": {

"field": "order_type"

}

实践三：删除 ES 索引重复数据

核酸检测数据量大，数据存储选型如使用 elasticsearch、click house 等列数据库，数据重复是绕不开的话题，应用可通过计划任务等方式检测到重复数据并及时处理。

单字段查重

GET my_order/_search

{

"size": 0,

"query": {

"term": {

"tenant_id": 1

}

"aggs": {

"duplicateCount": {

"terms": {

"field": "cid",

"size": 1000,

"min_doc_count": 2

}

多字段查重

GET my_order/_search

{

"size": 0,

"aggs": {

"duplicateCount": {

"terms": {

"script": {

"lang": "painless",

"source": "doc['tenant_id'].value + doc['cid'].value"

"size": 100,

"min_doc_count": 2

}

数据查重并在 duplicateDocuments 数组展示细节

GET my_order/_search

{

"size": 0,

"aggs": {

"duplicateCount": {

"terms": {

"script": {

"lang": "painless",

"source": "doc['tenant_id'].value + doc['cid'].value"

"size": 100,

"min_doc_count": 2

"aggs": {

"duplicateDocuments": {

"top_hits": {}

}

查询到的重复数据记入日志，核实后使用_delete_by_query删除

POST my_order/_delete_by_query?conflicts=proceed&max_docs=1

{

"query": {

"term": {

"cid": 2

}

max_docs为 response 当前 key 中bucket.doc_count的数量-1

附php版本 demo 供参考

public function clearDuplicate()

{

$index = 'my_order';

$client = ClientBuilder::create()->build();

$params = [

'index' => $index,

'size' => 0,

'body' => [

'query' => [

...

'aggs' => [

'duplicateCount' => [

'terms' => [

'field' => 'cid',

'size' => 1000,

'min_doc_count' => 2

]

];

$result = $client->search($params);

$bucket = ArrayHelper::getValue($result, 'aggregations.duplicateCount.buckets');

if (!is_array($bucket) || empty($bucket)) {

return;

}

foreach ($bucket as $item) {

$maxDocs = ArrayHelper::getValue($item, 'doc_count', 0) - 1;

$key = ArrayHelper::getValue($item, 'key');

if ($maxDocs < 1 || empty($key)) {

continue;

}

$client->deleteByQuery([

'index' => $index,

'conflicts' => 'proceed',

'max_docs' => $maxDocs,

'body' => [

'query' => [

'bool' => [

...

]);

}

附：实验环境

linux 操作系统

$ uname -a

Linux LAPTOP-QK4HAU1D 5.15.90.1-microsoft-standard-WSL2 #1 SMP Fri Jan 27 02:56:13 UTC 2023 x86_64 x86_64 x86_64 GNU/Linux

$ cat /etc/issue

Ubuntu 22.04.2 LTS \n \l

elasticsearch 版本

GET /

{

"name" : "elasticsearch",

"cluster_name" : "docker-cluster",

"cluster_uuid" : "6xwN3rfbQ2KGgQdt8IUKqg",

"version" : {

"number" : "7.16.2",

"build_flavor" : "default",

"build_type" : "docker",

"build_hash" : "2b937c44140b6559905130a8650c64dbd0879cfb",

"build_date" : "2021-12-18T19:42:46.604893745Z",

"build_snapshot" : false,

"lucene_version" : "8.10.1",

"minimum_wire_compatibility_version" : "6.8.0",

"minimum_index_compatibility_version" : "6.0.0-beta1"

"tagline" : "You Know, for Search"

}

扫码关注微信公众号

参考： [1] https://gitee.com/geektime-geekbang/geektime-ELK [2] https://www.elastic.co/guide/en/elasticsearch/reference/7.17/search-aggregations.html

推荐链接

评论可见，请评论后查看内容，谢谢！！！

您阅读本篇文章共花了：

金钥匙

php elasticsearch聚合查询实践

大数据搜索引擎 Elasticsearch 核心技术（十）：GEO 地理查询（geo

git实用命令 git常用分支命令

发表评论取消回复

金钥匙

php elasticsearch聚合查询实践

大数据 搜索引擎 Elasticsearch 核心技术（十）：GEO 地理查询（geo

git实用命令 git常用分支命令

相关文章

发表评论取消回复

大数据搜索引擎 Elasticsearch 核心技术（十）：GEO 地理查询（geo