---
title: ETL复习(1)
author: Gamehu
tags:
---
最近在找工作,有些大数据岗位我想投,奈何之前的工作里大数据不是主业,大数据经验不够看。我的相关经验最早要追溯到15年的spark+hive,然后是17年的storm+hbase,再到最近的flink+ck。我觉得努把力,看能不能够一够大数据相关的岗位。
把我给媳妇儿配的打LOL的电脑,偷偷拿来用一用,当成小型服务器,反正性能对LOL来说,很过剩了,不影响。
我之前鼓捣其它技术的时候就在电脑上装了虚拟机,所以也不折腾了,直接装个ubuntu,然后装个docker+docker compose,就差不多了。
单独说下,因为docker默认用的是国外的镜像源,所以安装后几乎是不可用的,这时候需要配置国内的镜像。 要注意先验证镜像源,比如通过curl等命令,看是否能正常访问、是否能免验证访问。我就是被阿里云的镜像加速器耽搁了小半个小时,按照官方的配置始终403,最后才发现,原来阿里前几个月更新了协议,大概意思是,不再支持外部直接使用加速镜像,只支持阿里云自身的产品使用。
{% codeblock %}
# 1. Verify the mirror first: request it with curl and make sure it answers
#    normally (no 403, no authentication required) before configuring it.
curl 镜像源
# 2. Register the mirror: write the registry-mirrors list into Docker's daemon
#    configuration (replace https://xxxx with the mirror that passed step 1).
sudo mkdir -p /etc/docker
sudo tee /etc/docker/daemon.json <<-'EOF'
{
"registry-mirrors": [
"https://xxxx"
]
}
EOF
# 3. Apply the change: reload the systemd units and restart the Docker daemon.
sudo systemctl daemon-reload
sudo systemctl restart docker
# 4. Check that the mirror setting was picked up by the daemon.
docker info
# 5. Pull an image to confirm that the mirror actually works.
docker pull xxx
{% endcodeblock %}
{% codeblock %}
# 1. Fetch the ClickHouse server image.
docker image pull clickhouse/clickhouse-server
# 2. Create the host directories ClickHouse needs (data, config and logs).
for sub in data config logs; do
  mkdir -p "/data/clickhouse/${sub}"
done
# 3. ClickHouse server configuration: write config.xml on the host so it can be
#    mounted at /etc/clickhouse-server/config.xml inside the container.
#    The heredoc delimiter is quoted ('EOF') so the shell copies the XML verbatim
#    instead of expanding any $variable or `command` that may appear in it.
cat > /data/clickhouse/config/config.xml << 'EOF'
<?xml version="1.0"?>
<yandex>
    <logger>
        <level>information</level>
        <log>/var/log/clickhouse-server/clickhouse-server.log</log>
        <errorlog>/var/log/clickhouse-server/clickhouse-server.err.log</errorlog>
    </logger>
    <http_port>8123</http_port>
    <tcp_port>9000</tcp_port>
    <interserver_http_port>9009</interserver_http_port>
    <listen_host>0.0.0.0</listen_host>
    <max_connections>4096</max_connections>
    <keep_alive_timeout>10</keep_alive_timeout>
    <max_concurrent_queries>100</max_concurrent_queries>
    <uncompressed_cache_size>8589934592</uncompressed_cache_size>
    <mark_cache_size>5368709120</mark_cache_size>
    <path>/var/lib/clickhouse/</path>
    <tmp_path>/var/lib/clickhouse/tmp/</tmp_path>
    <user_directories>
        <users_xml>
            <path>/etc/clickhouse-server/users.xml</path>
        </users_xml>
    </user_directories>
    <timezone>UTC</timezone>
</yandex>
EOF
# 4. ClickHouse user management: write users.xml on the host so it can be
#    mounted at /etc/clickhouse-server/users.xml inside the container.
#    - Replace "yourpassword" with a real password before running this.
#    - The heredoc delimiter is quoted ('EOF') so a password containing $ or
#      backticks is written literally instead of being expanded by the shell.
#    - NOTE(review): <ip>::/0</ip> places no network restriction on the default
#      user; narrow it if the host is reachable from untrusted networks.
cat > /data/clickhouse/config/users.xml << 'EOF'
<?xml version="1.0"?>
<yandex>
    <users>
        <default>
            <password>yourpassword</password>
            <networks>
                <ip>::/0</ip>
            </networks>
            <profile>default</profile>
            <quota>default</quota>
        </default>
    </users>
    <profiles>
        <default>
            <max_memory_usage>10000000000</max_memory_usage>
            <use_uncompressed_cache>0</use_uncompressed_cache>
            <load_balancing>random</load_balancing>
        </default>
    </profiles>
    <quotas>
        <default>
            <interval>
                <duration>3600</duration>
                <queries>0</queries>
                <errors>0</errors>
                <result_rows>0</result_rows>
                <read_rows>0</read_rows>
                <execution_time>0</execution_time>
            </interval>
        </default>
    </quotas>
</yandex>
EOF
# 5. Start the server container in the background: publish the three ports,
#    mount the host data directory, both config files and the log directory,
#    raise the open-file limit, and restart the container automatically.
docker run --detach \
  --name clickhouse-server \
  --ulimit nofile=262144:262144 \
  --publish 8123:8123 \
  --publish 9000:9000 \
  --publish 9009:9009 \
  --volume /data/clickhouse/data:/var/lib/clickhouse \
  --volume /data/clickhouse/config/config.xml:/etc/clickhouse-server/config.xml \
  --volume /data/clickhouse/config/users.xml:/etc/clickhouse-server/users.xml \
  --volume /data/clickhouse/logs:/var/log/clickhouse-server \
  --restart=always \
  clickhouse/clickhouse-server:latest
# 6. Smoke-test from inside the container by opening a client session.
docker exec --interactive --tty clickhouse-server clickhouse-client --password yourpassword
# 7. Make the service reachable from outside. Install Docker Compose so the long
#    `docker run` command does not have to be retyped and so that further
#    containers can be managed the same way. Package installation needs root.
sudo apt update
sudo apt install -y docker-compose
# 8. Write the compose file; do not forget the environment section.
#    The file is written with a quoted heredoc so the YAML indentation, which
#    Compose needs in order to parse the nesting, is preserved exactly.
#    NOTE(review): CLICKHOUSE_PASSWORD is a placeholder; presumably it should
#    match the password configured in users.xml. Confirm before use.
cat > /data/clickhouse/docker-compose.yml << 'EOF'
version: '3'
services:
  clickhouse:
    image: clickhouse/clickhouse-server:latest
    container_name: clickhouse-server
    restart: always
    ports:
      - "8123:8123"
      - "9000:9000"
      - "9009:9009"
    volumes:
      - /data/clickhouse/data:/var/lib/clickhouse
      - /data/clickhouse/config/config.xml:/etc/clickhouse-server/config.xml
      - /data/clickhouse/config/users.xml:/etc/clickhouse-server/users.xml
      - /data/clickhouse/logs:/var/log/clickhouse-server
    environment:
      - CLICKHOUSE_USER=default
      - CLICKHOUSE_PASSWORD=xxxx
    ulimits:
      nofile:
        soft: 262144
        hard: 262144
EOF
# Remove the container that was started with `docker run` (it uses the same
# container_name and would conflict), then start the service through Compose.
docker rm -f clickhouse-server
cd /data/clickhouse
docker-compose up -d
# 9. Check that the HTTP interface answers the query (replace the xx placeholders
#    with the host address and the password).
curl "http://xx:8123/?user=default&password=xx&query=SELECT%201"
{% endcodeblock %}
参考:

- https://www.coderjia.cn/archives/dba3f94c-a021-468a-8ac6-e840f85867ea
- https://hub.docker.com/r/clickhouse/clickhouse-server/