If you have opened this Door article, then for sure you have already dealt with ClickHouse and you can miss interesting details about its convenience and speed, and go straight to the point - in fact, how to create dictionaries and work with them in ClickHouse.
What are dictionaries in ClickHouse?
β key
β value
. ClickHouse, ETL- .
ClickHouse :
- ClickHouse .
-
TTL
β ClickHouse . - ClickHouse β XML- DDL-.
: / , HTTP(s) , ..
xml-, dictionaries_config
ClickHouse.
, dictionaries_lazy_load
.
( ) β .
, , system.dictionaries
, :
- ;
- ;
- , RAM .
xml DDL-. , β DDL-.
xml :
<yandex>
<!-- , -->
<comment>Some comments</comment>
<!-- , -->
<include_from>/etc/metrika.xml</include_from>
<dictionary>
<!-- -->
</dictionary>
...
<dictionary>
<!-- -->
</dictionary>
</yandex>
DDL-, .
:
<dictionary>
<name>clients</name>
<surce>
<clickhouse>
<host>myHostName</host>
<port>9000</port>
<user>admin</user>
<password>secret_password</password>
<db>clients</db>
<table>users</table>
<where>id<=10</where>
</clickhouse>
</surce>
<lifetime>
<min>3600</min>
<max>5400</max>
</lifetime>
<layout>
<flat/>
</layout>
<structure>
<id>user_id</id>
<attribute>
<name>username</name>
<type>string</type>
</attribute>
<attribute>
<name>age</name>
<type>Int8</type>
</attribute>
</structure>
</dictionary>
:
name
β ;source
β ;lifetime
β ;layout
β . ;structure
β . , .
DDL-:
CREATE DICTIONARY dict_users_id (
id UInt64,
username String,
email String,
status UInt16,
hash String
)
PRIMARY KEY id
SOURCE(MYSQL(
port 3306
user clickhouse
password secret_password
replica(host 'mysql1.fevlake.com' priority 1)
db fevlake_dicts
table users
))
LAYOUT(HASHED())
LIFETIME(MIN 3600 MAX 5400);
. β :
- HTTP(s)
β , .
:
<surce>
<file>
<path>/opt/dictionaries/clients.csv</path>
<format>CSV</format>
</file>
</surce>
:
path
β .format
β . ClickHouse.
DDL-:
SOURCE(FILE(path '/opt/dictionaries/clients.csv' format 'CSV'))
SETTINGS(format_csv_allow_single_quotes = 0)
MySQL .
:
<surce>
<mysql>
<port>3306</port>
<user>clickhouse</user>
<password>secret_password</password>
<replica>
<host>example01-1</host>
<priority>1</priority>
</replica>
<replica>
<host>example01-2</host>
<priority>1</priority>
</replica>
<db>db_name</db>
<table>table_name</table>
<where>id=10</where>
<invalidate_query>SQL_QUERY</invalidate_query>
</mysql>
</surce>
port
β MySQL.<replica>
.user
β MySQL.<replica>
.password
β MySQL.<replica>
.replica
β . .db
β .table
β .where
β .WHERE
MySQL, ,id >= 3 AND id < 10
( ).invalidate_query
β ( ).
DDL-:
SOURCE(MYSQL(
port 3306
user clickhouse
password secret_password
replica(host 'mysql1.fevlake.com' priority 1)
db fevlake_dicts
table users
))
ClickHouse:
flat
hashed
sparse_hashed
cache
direct
range_hashed
complex_key_hashed
complex_key_cache
complex_key_direct
ip_trie
3, , β flat
, hashed
complex_key_hashed
. .
Flat
, . UInt64
500 000, ClickHouse .
.
:
<layout>
<flat/>
</layout>
LAYOUT(FLAT())
Hashed
- . , .
:
<layout>
<hashed/>
</layout>
LAYOUT(HASHED())
omplex_key_hashed
. hashed
.
:
<layout>
<hashed/>
</layout>
LAYOUT(COMPLEX_KEY_HASHED())
<structure>
, .
XML:
<structure>
<id>user_id</id>
<attribute>
<name>username</name>
<type>string</type>
</attribute>
<attribute>
<name>age</name>
<type>Int8</type>
</attribute>
</structure>
:
<id>
β ;<attribute>
β . .
ClickHouse :
- .
UInt64
.<id>
PRIMARY KEY
. - . .
<key>
PRIMARY KEY
.
: UInt64
.
:
<id>
<name>user_id</name>
</id>
CREATE DICTIONARY (
user_id UInt64,
...
)
PRIMARY KEY user_id
...
PRIMARY KEY
β .
(tuple
) . layout
complex_key_hashed
complex_key_cache
.
<key>
. , . :
<key>
<attribute>
<name>field1</name>
<type>String</type>
</attribute>
<attrbute>
<name>field2</name>
<type>UInt32</type>
</attribute>
...
</key>
CREATE DICTIONARY ( field1 String, field2 String ... )
PRIMARY KEY field1, field2
...
<structure>
...
<attribute>
<name>Name</name>
<type>ClickHouseDataType</type>
<null_value></null_value>
<expression>rand64()</expression>
<hierarchical>true</hierarchical>
<injective>true</injective>
<is_object_id>true</is_object_id>
</attribute>
</structure>
CREATE DICTIONARY somename (
Name ClickHouseDataType DEFAULT '' EXPRESSION rand64() HIERARCHICAL INJECTIVE IS_OBJECT_ID
)
ClickHouse
ClickHouse β IP (v4) .
, : ip String
country String
. GeoIP2 MaxMind.
MaxMind .mmdb API .
ClickHouse .mmdb, β MaxMind CSV, .
IP , :
GeoIP2-Country-Blocks-IPv4.csv
β IP ID ;GeoIP2-Country-Locations-en.csv
β .
, DDL:
CREATE DICTIONARY dicts.geoip_country_blocks_ipv4 (
network String DEFAULT '',
geoname_id UInt64 DEFAULT 0,
registered_country_geoname_id UInt64 DEFAULT 0,
represented_country_geoname_id UInt64 DEFAULT 0,
is_anonymous_proxy UInt8 DEFAULT 0,
is_satellite_provider UInt8 DEFAULT 0
)
PRIMARY KEY network
SOURCE(FILE(
path '/var/lib/clickhouse/user_files/GeoIP2-Country-Blocks-IPv4.csv'
format 'CSVWithNames'
))
LAYOUT(IP_TRIE())
LIFETIME(300);
geoip_country_blocks_ipv4
:
network
β IP , .geoname_id
β ID .
β CSV.
ClickHouse ID, ip_trie
. IP .
geoip_country_locations_en
:
CREATE DICTIONARY dicts.geoip_country_locations_en (
geoname_id UInt64 DEFAULT 0,
locale_code String DEFAULT '',
continent_code String DEFAULT '',
continent_name String DEFAULT '',
country_iso_code String DEFAULT '',
country_name String DEFAULT '',
is_in_european_union UInt8 DEFAULT 0
)
PRIMARY KEY geoname_id
SOURCE(FILE(
path '/var/lib/clickhouse/user_files/GeoIP2-Country-Locations-en.csv'
format 'CSVWithNames'
))
LAYOUT(HASHED())
LIFETIME(300);
ID . GeoIP2-Country-Locations-en.csv
:
geoname_id
β ID , , .country_name
β .
hashed
.
CSV .
, :
CREATE TEMPORARY TABLE user_visits (user_ip String, user_id UUID);
---
INSERT INTO user_visits VALUES
('178.248.237.68', generateUUIDv4()),
('82.192.95.175', generateUUIDv4());
. β dictGet*
:
SELECT
dictGetString('dicts.geoip_city_locations_en', 'country_name', users_country_id) AS users_country,
uniqs
FROM (
SELECT
dictGetUInt64('dicts.geoip_country_blocks_ipv4', 'geoname_id', tuple(IPv4StringToNum(user_ip))) AS users_country_id,
uniq(user_id) AS uniqs
FROM user_visits
GROUP BY users_country_id
);
:
ββusers_countryββ¬βuniqsββ
β Russia β 1 β
β Netherlands β 1 β
βββββββββββββββββ΄ββββββββ
2 rows in set. Elapsed: 0.003 sec.
:
-
user_ip
,ip_trie
-:tuple(IPv4StringToNum(user_ip))
; - , ID
users_country_id
:dictGetUInt64('geoip_country_blocks_ipv4', 'geoname_id', ...) as users_country_id
; - :
uniq(user_id) as uniq_users
; - ID , :
GROUP BY users_country_id
; - , ID , :
dictGetString('geoip_city_locations_en', 'country_name', users_country_id) AS users_country
.
. GeoIP2 , :)
. , ClickHouse .