... | @@ -45,6 +45,10 @@ baidu_v 百度v |
... | @@ -45,6 +45,10 @@ baidu_v 百度v |
|
```
|
|
```
|
|
|
|
|
|
<!--spider_name-->
|
|
<!--spider_name-->
|
|
|
|
## 爬虫名称
|
|
|
|
```
|
|
|
|
baidu_v
|
|
|
|
```
|
|
|
|
|
|
## 代码地址
|
|
## 代码地址
|
|
```
|
|
```
|
... | @@ -70,6 +74,12 @@ baidu_v 百度v |
... | @@ -70,6 +74,12 @@ baidu_v 百度v |
|
## 任务来源
|
|
## 任务来源
|
|
<!-- 说明爬虫任务的输入,例如来自某个数据库表。如果任务来自数据库表,应简要说明该表中的数据是如何维护的。 -->
|
|
<!-- 说明爬虫任务的输入,例如来自某个数据库表。如果任务来自数据库表,应简要说明该表中的数据是如何维护的。 -->
|
|
```
|
|
```
|
|
|
|
crontab对应机器: 10.8.6.21
|
|
|
|
*/1 * * * * cd /home/collie/product/app_ad/jobs/data_pump && /home/collie/.conda/envs/collie/bin/python data_pump.py pump-data -c ../../data_pump/commit_baiduv_person_task.yml > /dev/null 2>&1
|
|
|
|
*/1 * * * * cd /home/collie/product/app_ad/jobs/data_pump && /home/collie/.conda/envs/collie/bin/python data_pump.py pump-data -c ../../data_pump/commit_baiduv_company_task.yml > /dev/null 2>&1
|
|
|
|
|
|
|
|
http://192.168.109.110/granite/project-collie-app/-/blob/baiduv_20211124/app_ad/data_pump/commit_baiduv_company_task.yml
|
|
|
|
http://192.168.109.110/granite/project-collie-app/-/blob/baiduv_20211124/app_ad/data_pump/commit_baiduv_person_task.yml
|
|
|
|
|
|
```
|
|
```
|
|
|
|
|
... | @@ -88,11 +98,44 @@ baidu_v 百度v |
... | @@ -88,11 +98,44 @@ baidu_v 百度v |
|
### 任务参数说明
|
|
### 任务参数说明
|
|
<!-- 仅说明本爬虫特有的参数;spider_name、task_params、task_src、task_result 等通用参数无需说明。 -->
|
|
<!-- 仅说明本爬虫特有的参数;spider_name、task_params、task_src、task_result 等通用参数无需说明。 -->
|
|
```
|
|
```
|
|
|
|
data_type: company
|
|
|
|
data_type: person
|
|
```
|
|
```
|
|
|
|
|
|
## data_type说明
|
|
## data_type说明
|
|
<!-- 说明本爬虫可能产生的 data_type。 -->
|
|
<!-- 说明本爬虫可能产生的 data_type。 -->
|
|
```
|
|
```
|
|
|
|
data_type: company
|
|
|
|
db:
|
|
|
|
host: bdp-rds-103.mysql.rds.aliyuncs.com
|
|
|
|
port: 3306
|
|
|
|
database: taskhub
|
|
|
|
user: taskhub_ro
|
|
|
|
password: thro@0818
|
|
|
|
|
|
|
|
query:
|
|
|
|
table: task_scheduler_ic
|
|
|
|
limit: 2500
|
|
|
|
columns:
|
|
|
|
- task_id
|
|
|
|
- task_params
|
|
|
|
|
|
|
|
|
|
|
|
data_type: person
|
|
|
|
db:
|
|
|
|
host: bdp-rds-103.mysql.rds.aliyuncs.com
|
|
|
|
port: 3306
|
|
|
|
database: taskhub
|
|
|
|
user: taskhub_ro
|
|
|
|
password: thro@0818
|
|
|
|
|
|
|
|
query:
|
|
|
|
table: task_scheduler_iich
|
|
|
|
limit: 2500
|
|
|
|
columns:
|
|
|
|
- task_id
|
|
|
|
- task_params
|
|
|
|
|
|
```
|
|
```
|
|
|
|
|
|
## 爬虫结果的超级数据
|
|
## 爬虫结果的超级数据
|
... | @@ -167,10 +210,28 @@ scrapy |
... | @@ -167,10 +210,28 @@ scrapy |
|
|
|
|
|
## Taskhub地址
|
|
## Taskhub地址
|
|
```
|
|
```
|
|
|
|
http://192.168.109.110/granite/project-taskhub/-/blob/master/taskhub/config/ad/config_ad.d/01_task.yaml
|
|
```
|
|
```
|
|
|
|
|
|
## Taskhub调度规则说明
|
|
## Taskhub调度规则说明
|
|
```
|
|
```
|
|
|
|
outbound:
|
|
|
|
- name: 'baiduv'
|
|
|
|
selector:
|
|
|
|
- "platform_name == 'baiduv'"
|
|
|
|
retry_limits: 5
|
|
|
|
token_per_second: 4
|
|
|
|
class: 'redis_queue.RedisListOutbound'
|
|
|
|
token_scope: 'baiduv'
|
|
|
|
init:
|
|
|
|
url: '{redis_url}'
|
|
|
|
key: "baiduv_company_name"
|
|
|
|
task_params:
|
|
|
|
- company_name
|
|
|
|
- platform_name
|
|
|
|
- data_type
|
|
|
|
task_params_wrapped: False
|
|
|
|
|
|
```
|
|
```
|
|
|
|
|
|
|
|
|
... | | ... | |