Peter 工程日誌

Elasticsearch與MySQL上之資料查詢效率評估

前言

在本文章中,主要討論的是有關於Elasticsearch與MySQL資料庫之間對於資料查詢的速度間的差異與比較。(針對的是用電量資料做兩者查詢速度的評估,因為筆數較多)

透過我們所設計的查詢評估,來得到結論之外,同時也會有結論我們選擇哪一種資料庫來儲存我們的資料。

前置環境

在此文章開始前,有幾點需要去注意。

資料集介紹

MySQL

用電資料在MySQL上的資料表設定

建立資料表SQL方式如下


CREATE TABLE `electric_data_2019` (
`electricID` varchar(15) COLLATE utf8mb4_unicode_ci NOT NULL COMMENT '電號',
`date` datetime NOT NULL COMMENT '當下測量用電時間',
`mwh` double DEFAULT NULL COMMENT '用電量,單位wh'
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci COMMENT='1000戶各用戶各時段用電量';


電量資料分區方式如下


ALTER TABLE `electric_data_2019`
PARTITION BY RANGE (TO_DAYS(`date`))
(
PARTITION p_201902 VALUES LESS THAN (TO_DAYS('2019-02-01 00:00:00')),
PARTITION p_201903 VALUES LESS THAN (TO_DAYS('2019-03-01 00:00:00')),
PARTITION p_201904 VALUES LESS THAN (TO_DAYS('2019-04-01 00:00:00')),
PARTITION p_201905 VALUES LESS THAN (TO_DAYS('2019-05-01 00:00:00')),
PARTITION p_201906 VALUES LESS THAN (TO_DAYS('2019-06-01 00:00:00')),
PARTITION p_201907 VALUES LESS THAN (TO_DAYS('2019-07-01 00:00:00')),
PARTITION p_201908 VALUES LESS THAN (TO_DAYS('2019-08-01 00:00:00')),
PARTITION p_201909 VALUES LESS THAN (TO_DAYS('2019-09-01 00:00:00')),
PARTITION p_201910 VALUES LESS THAN (TO_DAYS('2019-10-01 00:00:00')),
PARTITION p_201911 VALUES LESS THAN (TO_DAYS('2019-11-01 00:00:00')),
PARTITION p_201912 VALUES LESS THAN (TO_DAYS('2019-12-01 00:00:00')),
PARTITION p_202001 VALUES LESS THAN (TO_DAYS('2020-01-01 00:00:00')),
PARTITION p_max_future_dates VALUES LESS THAN MAXVALUE
);


分區的意思就是將2019年1月到12月分式,用月來分區

電量資料表建立索引方式


ALTER TABLE `electric_data_2019`
ADD PRIMARY KEY (`electricID`,`date`),
ADD KEY `electricID` (`electricID`),
ADD KEY `date` (`date`);


因為一個電號在一個時間下只會有一筆值,有鑑於此,在主鍵設定上將電號和測量日期看成一組主鍵。

Elasticsearch

查詢速度評估

我們針對查詢方式如下:

查詢速度評估結果

指定某一個電號在六月的時候查詢所有的用電資料所使用的bash script

evaluate_mysql_query_time.sh


#!/bin/bash
# DB: MySQL
# 指定某電號與月份下的所有資料

rm -f ./evaluate_mysql.sql
rm -f ./evaluate_mysql_result.txt

electric_number=$1
specific_month=$2
table_name="\`electric_data_2019\`"
sql_cmd=$(printf "select electricID, date, mwh from %s partition(p_%s) where electricID='%s';" "$table_name" "$specific_month" "$electric_number")

echo $sql_cmd >> ./evaluate_mysql.sql

counter=1
while [ $counter -le 10 ]
do
mysql -vvv -u root -p"password" -e"use electric_data; source evaluate_mysql.sql;" >> evaluate_mysql_result.txt
((counter++))
done

sum=0
evaluate_results=$(cat evaluate_mysql_result.txt | grep "rows in set" | awk '{print $5}' | sed 's/(//g')

for evaluate_result in $evaluate_results;
do
printf "execution time is %.2f sec\n" $evaluate_result
sum=$(echo "$sum+$evaluate_result" | bc)
done

echo "scale=2; $sum/10" | bc | awk '{printf "avaerage execution time is %.2f sec\n", $0}'

evaluate_es_query_time.sh


#!/bin/bash
# DB: ElasticSearch
# 指定某電號與月份下的所有資料

rm -f result.json

curl -o /dev/null -s -X PUT 127.0.0.1:9200/electric_number/_settings -d '{ "index.max_result_window" :"45000"}' -H "Content-Type: application/json"
printf "\n"

size=45000
start_date="20180601"
end_date="20180701"
counter=1
sum=0

form_data=$(printf '
{
"size": "%d",
"query": {
"range": {
"date": {
"gte": "%s",
"lt": "%s",
"format": "yyyyMMdd"
}
}
}
}
' $size $start_date $end_date)

while [ $counter -le 10 ]
do
execution_time=$(curl -o result.json -s -w '%{time_total}\n' -X GET "127.0.0.1:9200/electric_number/_search" -H 'Content-Type: application/json' -d "$form_data")

records=$(cat result.json | jq '.hits.hits | length')

sum=$(echo "scale=3; $sum+$execution_time" | bc)

counter=$(($counter+1))

done;

echo "scale=3; $sum/10" | bc | awk '{printf "average execution time is %.3f sec\n", $0}'

定某一個電號在六月中某一天所有的用電量資料所使用的bash script

evaluate_mysql_query_perday.sh


#!/bin/bash
# DB: MySQL
# 指定某電號與月份下的某一天所有資料

rm -f ./evaluate_mysql.sql
rm -f ./evaluate_mysql_result.txt

electric_number=$1
specific_month=$2
table_name="\`electric_data_2019\`"
sql_cmd=$(printf "select electricID, date, mwh from %s partition(p_%s) where electricID='%s' and date>='%s' and date<'%s';" "$table_name" "$specific_month" "$electric_number" "20190601" "20190602")

echo $sql_cmd >> ./evaluate_mysql.sql

counter=1
while [ $counter -le 10 ]
do
mysql -vvv -u root -p"password" -e"use electric_data; source evaluate_mysql.sql;" >> evaluate_mysql_result.txt
((counter++))
done

sum=0
evaluate_results=$(cat evaluate_mysql_result.txt | grep "rows in set" | awk '{print $5}' | sed 's/(//g')

for evaluate_result in $evaluate_results;
do
printf "execution time is %.2f sec\n" $evaluate_result
sum=$(echo "$sum+$evaluate_result" | bc)
done

echo "scale=3; $sum/10" | bc | awk '{printf "avaerage execution time is %.2f sec\n", $0}'

evaluate_es_query_perday.sh


#!/bin/bash
# DB: ElasticSearch
# 指定某電號與月份下的一天所有資料

rm -f result.json

counter=1
size=10000
start_date="20190601"
end_date="20190602"
sum=0
rows=0
while [ $counter -le 10 ]
do

form_data=$(printf '
{
"size": %d,
"query": {
"range": {
"date": {
"gte": "%s",
"lt": "%s",
"format": "yyyyMMdd"
}
}
}
}
' $size $start_date $end_date)

execution_time=$(curl -o result.json -s -w '%{time_total}\n' -X GET "127.0.0.1:9200/electric_number/_search" -H 'Content-Type: application/json' -d "$form_data")

records=$(cat result.json | jq '.hits.hits | length')

sum=$(echo "scale=4; $sum+$execution_time" | bc)

rm -f result.json

counter=$(($counter+1))

done;

echo "scale=4; $sum/10" | bc | awk '{printf "average execution time is %.4f sec\n", $0}'

結論

參考資料

作者群

Exit mobile version