diff --git a/COVID-19/spider.py b/COVID-19/spider.py
deleted file mode 100644
index 8f4edb8..0000000
--- a/COVID-19/spider.py
+++ /dev/null
@@ -1,47 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-# @Time : 2021/2/9 11:06
-# @Author : way
-# @Site :
-# @Describe:
-
-import requests
-
-
-# 重试装饰器
-def retry(func):
- max_retry = 5
-
- def run(*args, **kwargs):
- for i in range(max_retry + 1):
- if func(*args, **kwargs):
- break
- else:
- print("retrying...")
- else:
- print("update fail !!!")
-
- return run
-
-
-@retry
-def download(url):
- try:
- file = url.split('/')[-1]
- with open(file, 'w', encoding='utf-8-sig') as f:
- f.write(requests.get(url).text)
- print(f'{file} has been updated success')
- return True
- except Exception as e:
- print(e)
- return False
-
-
-if __name__ == "__main__":
- urls = [
- 'https://github.com/CSSEGISandData/COVID-19/raw/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv',
- 'https://github.com/CSSEGISandData/COVID-19/raw/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv',
- 'https://github.com/CSSEGISandData/COVID-19/raw/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv'
- ]
- for url in urls:
- download(url)
diff --git "a/COVID-19/\346\226\260\345\206\240\347\226\253\346\203\205\346\225\260\346\215\256\345\210\206\346\236\220.ipynb" "b/COVID-19/\346\226\260\345\206\240\347\226\253\346\203\205\346\225\260\346\215\256\345\210\206\346\236\220.ipynb"
index 9e95831..313cf01 100644
--- "a/COVID-19/\346\226\260\345\206\240\347\226\253\346\203\205\346\225\260\346\215\256\345\210\206\346\236\220.ipynb"
+++ "b/COVID-19/\346\226\260\345\206\240\347\226\253\346\203\205\346\225\260\346\215\256\345\210\206\346\236\220.ipynb"
@@ -8,7 +8,7 @@
"# 1、数据集说明\n",
"\n",
"这是一份来自 Johns Hopkins University 在github 开源的全球新冠肺炎 [COVID-19](https://github.com/CSSEGISandData/COVID-19/tree/master/csse_covid_19_data/csse_covid_19_time_series) 数据集,每日时间序列汇总,包括确诊、死亡和治愈。所有数据来自每日病例报告。数据持续更新中。\n",
- ">可以使用 python spider.py 获取最新的数据集文件。另外,由于数据集中没有美国的治愈数据,所以在统计全球的现有确诊人员和治愈率的时候会有很大误差,代码里面先不做这个处理,期待数据集的完善。\n"
+ ">由于数据集中没有美国的治愈数据,所以在统计全球的现有确诊人员和治愈率的时候会有很大误差,代码里面先不做这个处理,期待数据集的完善。\n"
]
},
{
@@ -199,8 +199,8 @@
" background: #F8F8F8;\n",
" }\n",
" \n",
- "
\n",
- "
(2/23/21)全球疫情情况
\n",
+ "
\n",
+ "
(3/20/21)全球疫情情况
\n",
"
由于数据集没有美国的治愈数据,所以治愈人数和治愈率都远低于实际,等待数据集完善
\n",
"
\n",
" \n",
@@ -214,11 +214,11 @@
" \n",
" \n",
" \n",
- " 112108145 | \n",
- " 2485368 | \n",
- " 63295139 | \n",
- " 2.22% | \n",
- " 56.46% | \n",
+ " 122813796 | \n",
+ " 2709639 | \n",
+ " 69523087 | \n",
+ " 2.21% | \n",
+ " 56.61% | \n",
"
\n",
" \n",
"
\n",
@@ -226,7 +226,7 @@
"\n"
],
"text/plain": [
- "
"
+ ""
]
},
"execution_count": 4,
@@ -262,7 +262,7 @@
},
{
"cell_type": "code",
- "execution_count": 5,
+ "execution_count": 6,
"id": "closed-suicide",
"metadata": {},
"outputs": [
@@ -278,13 +278,13 @@
" });\n",
"\n",
"\n",
- " \n",
+ " \n",
"\n",
"\n"
],
"text/plain": [
- ""
+ ""
]
},
- "execution_count": 5,
+ "execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
@@ -3593,7 +3593,7 @@
},
{
"cell_type": "code",
- "execution_count": 6,
+ "execution_count": 7,
"id": "corresponding-overhead",
"metadata": {},
"outputs": [
@@ -3609,13 +3609,13 @@
" });\n",
"\n",
"\n",
- " \n",
+ " \n",
"\n",
"\n"
- ],
- "text/plain": [
- ""
- ]
- },
- "execution_count": 6,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "tl = Timeline()\n",
- "tl.add_schema(\n",
- "# is_auto_play=True,\n",
- " is_loop_play=False,\n",
- " play_interval=200,\n",
- " )\n",
- "target = confirmed_data.columns[6:].to_list()\n",
- "target.reverse()\n",
- "target = target[::7]\n",
- "target.reverse()\n",
- "for dt in target: \n",
- " confirmed = confirmed_data.groupby('Country/Region').agg({dt: 'sum'}).to_dict()[dt]\n",
- " c = (\n",
- " Map()\n",
- " .add(\"确诊人数\", [*confirmed.items()], \"world\", is_map_symbol_show=False)\n",
- " .set_series_opts(label_opts=opts.LabelOpts(is_show=False))\n",
- " .set_global_opts(\n",
- " title_opts=opts.TitleOpts(title=\"全球疫情历史发展情况\"),\n",
- " visualmap_opts=opts.VisualMapOpts(max_=200000), \n",
- " \n",
- " )\n",
- " )\n",
- " tl.add(c, dt)\n",
- "tl.render_notebook()"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "constitutional-macro",
- "metadata": {},
- "source": [
- "### 3.1.3 各国确诊人数 TOP20 排行"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 7,
- "id": "accomplished-amazon",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "\n",
- " \n",
- "\n",
- "\n"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "tl = Timeline()\n",
+ "tl.add_schema(\n",
+ "# is_auto_play=True,\n",
+ " is_loop_play=False,\n",
+ " play_interval=200,\n",
+ " )\n",
+ "target = confirmed_data.columns[6:].to_list()\n",
+ "target.reverse()\n",
+ "target = target[::7]\n",
+ "target.reverse()\n",
+ "for dt in target: \n",
+ " confirmed = confirmed_data.groupby('Country/Region').agg({dt: 'sum'}).to_dict()[dt]\n",
+ " c = (\n",
+ " Map()\n",
+ " .add(\"确诊人数\", [*confirmed.items()], \"world\", is_map_symbol_show=False)\n",
+ " .set_series_opts(label_opts=opts.LabelOpts(is_show=False))\n",
+ " .set_global_opts(\n",
+ " title_opts=opts.TitleOpts(title=\"全球疫情历史发展情况\"),\n",
+ " visualmap_opts=opts.VisualMapOpts(max_=200000), \n",
+ " \n",
+ " )\n",
+ " )\n",
+ " tl.add(c, dt)\n",
+ "tl.render_notebook()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "constitutional-macro",
+ "metadata": {},
+ "source": [
+ "### 3.1.3 各国确诊人数 TOP20 排行"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "id": "accomplished-amazon",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ " \n",
+ "\n",
+ "\n"
- ],
- "text/plain": [
- ""
- ]
- },
- "execution_count": 7,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "tl = Timeline()\n",
- "tl.add_schema(\n",
- "# is_auto_play=True,\n",
- " is_loop_play=False,\n",
- " play_interval=100,\n",
- " )\n",
- "\n",
- "for dt in confirmed_data.columns[6:]:\n",
- " confirmed = confirmed_data.groupby('Country/Region_zh').agg({dt: 'sum'}).sort_values(by=dt, ascending=False)[:20].sort_values(by=dt).to_dict()[dt]\n",
- " bar = (\n",
- " Bar()\n",
- " .add_xaxis([*confirmed.keys()])\n",
- " .add_yaxis(\"确诊人数\", [*confirmed.values()], label_opts=opts.LabelOpts(position=\"right\"))\n",
- " .reversal_axis()\n",
- " .set_global_opts(\n",
- " title_opts=opts.TitleOpts(\"各国确诊人数排行 TOP20\")\n",
- " )\n",
- " )\n",
- " tl.add(bar, dt)\n",
- "tl.render_notebook()\n"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "negative-portsmouth",
- "metadata": {},
- "source": [
- "### 3.1.4 全球疫情趋势"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 8,
- "id": "seasonal-greensboro",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "\n",
- " \n",
- "\n",
- "\n"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "tl = Timeline()\n",
+ "tl.add_schema(\n",
+ "# is_auto_play=True,\n",
+ " is_loop_play=False,\n",
+ " play_interval=100,\n",
+ " )\n",
+ "\n",
+ "for dt in confirmed_data.columns[6:]:\n",
+ " confirmed = confirmed_data.groupby('Country/Region_zh').agg({dt: 'sum'}).sort_values(by=dt, ascending=False)[:20].sort_values(by=dt).to_dict()[dt]\n",
+ " bar = (\n",
+ " Bar()\n",
+ " .add_xaxis([*confirmed.keys()])\n",
+ " .add_yaxis(\"确诊人数\", [*confirmed.values()], label_opts=opts.LabelOpts(position=\"right\"))\n",
+ " .reversal_axis()\n",
+ " .set_global_opts(\n",
+ " title_opts=opts.TitleOpts(\"各国确诊人数排行 TOP20\")\n",
+ " )\n",
+ " )\n",
+ " tl.add(bar, dt)\n",
+ "tl.render_notebook()\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "negative-portsmouth",
+ "metadata": {},
+ "source": [
+ "### 3.1.4 全球疫情趋势"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "id": "seasonal-greensboro",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ " \n",
+ "\n",
+ "\n"
],
"text/plain": [
- ""
+ ""
]
},
- "execution_count": 8,
+ "execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
@@ -134227,7 +142701,7 @@
},
{
"cell_type": "code",
- "execution_count": 9,
+ "execution_count": 10,
"id": "educated-computer",
"metadata": {},
"outputs": [
@@ -134287,8 +142761,8 @@
" background: #F8F8F8;\n",
" }\n",
" \n",
- " \n",
- "
(2/23/21)中国疫情情况
\n",
+ "
\n",
+ "
(3/20/21)中国疫情情况
\n",
"
\n",
"
\n",
" \n",
@@ -134303,12 +142777,12 @@
" \n",
" \n",
" \n",
- " 101749 | \n",
- " 4842 | \n",
- " 96220 | \n",
- " 4.76% | \n",
- " 94.57% | \n",
- " 687 | \n",
+ " 102523 | \n",
+ " 4849 | \n",
+ " 97167 | \n",
+ " 4.73% | \n",
+ " 94.78% | \n",
+ " 507 | \n",
"
\n",
" \n",
"
\n",
@@ -134316,10 +142790,10 @@
"\n"
],
"text/plain": [
- "
"
+ ""
]
},
- "execution_count": 9,
+ "execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
@@ -134358,7 +142832,7 @@
},
{
"cell_type": "code",
- "execution_count": 10,
+ "execution_count": 11,
"id": "introductory-friendship",
"metadata": {},
"outputs": [
@@ -134374,13 +142848,13 @@
" });\n",
"\n",
"\n",
- " \n",
+ " \n",
"\n",
"\n"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "confirmed = confirmed_data_china.groupby('Province/State_zh').agg({lastdate: 'sum'}).to_dict()[lastdate]\n",
+ "deaths = deaths_data_china.groupby('Province/State_zh').agg({lastdate: 'sum'}).to_dict()[lastdate]\n",
+ "recovered = recovered_data_china.groupby('Province/State_zh').agg({lastdate: 'sum'}).to_dict()[lastdate]\n",
+ "exists_confirmed = {key: value - deaths[key] - recovered[key] for key, value in confirmed.items()}\n",
+ "c = (\n",
+ " Map()\n",
+ " .add(\"确诊人数\", [*confirmed.items()], \"china\", is_map_symbol_show=False)\n",
+ " .add(\"治愈人数\", [*recovered.items()], \"china\", is_map_symbol_show=False)\n",
+ " .add(\"死亡人数\", [*deaths.items()], \"china\", is_map_symbol_show=False)\n",
+ " .add(\"现有确诊人数\", [*exists_confirmed.items()], \"china\", is_map_symbol_show=False)\n",
+ " .set_series_opts(label_opts=opts.LabelOpts(is_show=True))\n",
+ " .set_global_opts(\n",
+ " title_opts=opts.TitleOpts(title=f'({lastdate})中国疫情现状'),\n",
+ " visualmap_opts=opts.VisualMapOpts(max_=1000), \n",
+ " )\n",
+ ")\n",
+ "c.render_notebook()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "interpreted-feedback",
+ "metadata": {},
+ "source": [
+ "### 3.2.2 中国疫情历史发展情况"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "id": "bound-civilian",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ " \n",
+ "\n",
+ "\n"
- ],
- "text/plain": [
- ""
- ]
- },
- "execution_count": 10,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "confirmed = confirmed_data_china.groupby('Province/State_zh').agg({lastdate: 'sum'}).to_dict()[lastdate]\n",
- "deaths = deaths_data_china.groupby('Province/State_zh').agg({lastdate: 'sum'}).to_dict()[lastdate]\n",
- "recovered = recovered_data_china.groupby('Province/State_zh').agg({lastdate: 'sum'}).to_dict()[lastdate]\n",
- "exists_confirmed = {key: value - deaths[key] - recovered[key] for key, value in confirmed.items()}\n",
- "c = (\n",
- " Map()\n",
- " .add(\"确诊人数\", [*confirmed.items()], \"china\", is_map_symbol_show=False)\n",
- " .add(\"治愈人数\", [*recovered.items()], \"china\", is_map_symbol_show=False)\n",
- " .add(\"死亡人数\", [*deaths.items()], \"china\", is_map_symbol_show=False)\n",
- " .add(\"现有确诊人数\", [*exists_confirmed.items()], \"china\", is_map_symbol_show=False)\n",
- " .set_series_opts(label_opts=opts.LabelOpts(is_show=True))\n",
- " .set_global_opts(\n",
- " title_opts=opts.TitleOpts(title=f'({lastdate})中国疫情现状'),\n",
- " visualmap_opts=opts.VisualMapOpts(max_=1000), \n",
- " )\n",
- ")\n",
- "c.render_notebook()"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "interpreted-feedback",
- "metadata": {},
- "source": [
- "### 3.2.2 中国疫情历史发展情况"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 11,
- "id": "bound-civilian",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "\n",
- " \n",
- "\n",
- "\n"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "tl = Timeline()\n",
+ "tl.add_schema(\n",
+ "# is_auto_play=True,\n",
+ " is_loop_play=False,\n",
+ " play_interval=200,\n",
+ " )\n",
+ "target = confirmed_data_china.columns[6:].to_list()\n",
+ "target.reverse()\n",
+ "target = target[::7]\n",
+ "target.reverse()\n",
+ "for dt in target: \n",
+ " confirmed = confirmed_data_china.groupby('Province/State_zh').agg({dt: 'sum'}).to_dict()[dt]\n",
+ " c = (\n",
+ " Map()\n",
+ " .add(\"确诊人数\", [*confirmed.items()], \"china\", is_map_symbol_show=False)\n",
+ " .set_series_opts(label_opts=opts.LabelOpts(is_show=True))\n",
+ " .set_global_opts(\n",
+ " title_opts=opts.TitleOpts(title='中国疫情历史发展情况'),\n",
+ " visualmap_opts=opts.VisualMapOpts(max_=1000), \n",
+ " )\n",
+ " )\n",
+ " tl.add(c, dt)\n",
+ "tl.render_notebook()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "conventional-childhood",
+ "metadata": {},
+ "source": [
+ "### 3.2.3 各省确诊人数排行 TOP20"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "id": "ongoing-grenada",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "\n",
+ " \n",
+ "\n",
+ "\n"
- ],
- "text/plain": [
- ""
- ]
- },
- "execution_count": 11,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "tl = Timeline()\n",
- "tl.add_schema(\n",
- "# is_auto_play=True,\n",
- " is_loop_play=False,\n",
- " play_interval=200,\n",
- " )\n",
- "target = confirmed_data_china.columns[6:].to_list()\n",
- "target.reverse()\n",
- "target = target[::7]\n",
- "target.reverse()\n",
- "for dt in target: \n",
- " confirmed = confirmed_data_china.groupby('Province/State_zh').agg({dt: 'sum'}).to_dict()[dt]\n",
- " c = (\n",
- " Map()\n",
- " .add(\"确诊人数\", [*confirmed.items()], \"china\", is_map_symbol_show=False)\n",
- " .set_series_opts(label_opts=opts.LabelOpts(is_show=True))\n",
- " .set_global_opts(\n",
- " title_opts=opts.TitleOpts(title='中国疫情历史发展情况'),\n",
- " visualmap_opts=opts.VisualMapOpts(max_=1000), \n",
- " )\n",
- " )\n",
- " tl.add(c, dt)\n",
- "tl.render_notebook()"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "conventional-childhood",
- "metadata": {},
- "source": [
- "### 3.2.3 各省确诊人数排行 TOP20"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 12,
- "id": "ongoing-grenada",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "\n",
- " \n",
- "\n",
- "\n"
],
"text/plain": [
- ""
+ ""
]
},
- "execution_count": 12,
+ "execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
@@ -222292,7 +236303,7 @@
},
{
"cell_type": "code",
- "execution_count": 13,
+ "execution_count": 14,
"id": "decreased-insert",
"metadata": {},
"outputs": [
@@ -222308,13 +236319,13 @@
" });\n",
"\n",
"\n",
- " \n",
+ " \n",
"\n",
"\n"
],
"text/plain": [
- ""
+ ""
]
},
- "execution_count": 13,
+ "execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
diff --git a/README.md b/README.md
index 30b175d..73c6f05 100644
--- a/README.md
+++ b/README.md
@@ -27,7 +27,7 @@
| [10 万条厦门招聘数据分析](https://github.com/TurboWay/bigdata_analyse/blob/main/AmoyJob/2021厦门招聘数据分析.md) | 离线处理 | 清洗 pandas + 分析 hive + 可视化 ( hue + pyecharts ) + 预测 sklearn | [百度网盘](https://pan.baidu.com/s/1mco8dKb5o0qPd2kqsj7bNg) 提取码:9wx0|
| [7000 条租房数据分析](https://github.com/TurboWay/bigdata_analyse/blob/main/RentFromDanke/租房数据分析.md) | 离线处理 | 清洗 pandas + 分析 sqlite + 可视化 matplotlib | [百度网盘](https://pan.baidu.com/s/1l1x5qurJdkyUxAuhknj_Qw) 提取码:9en3 |
| [6000 条倒闭企业数据分析](https://nbviewer.jupyter.org/github/TurboWay/bigdata_analyse/blob/main/DeathCompany/倒闭企业数据分析.ipynb) | 离线处理 | 清洗 pandas + 分析 pandas + 可视化 (jupyter notebook + pyecharts) | [百度网盘](https://pan.baidu.com/s/1I6E6i4ZadxE9IlVPe3Bqwg) 提取码:xvgm |
-| [COVID-19 疫情数据分析](https://nbviewer.jupyter.org/github/TurboWay/bigdata_analyse/blob/main/COVID-19/新冠疫情数据分析.ipynb) | 离线处理 | 数据获取 requests + 清洗 pandas + 分析 pandas + 可视化 (jupyter notebook + pyecharts) | [csse_covid_19_time_series](https://github.com/CSSEGISandData/COVID-19/tree/master/csse_covid_19_data/csse_covid_19_time_series) |
+| [COVID-19 疫情数据分析](https://nbviewer.jupyter.org/github/TurboWay/bigdata_analyse/blob/main/COVID-19/新冠疫情数据分析.ipynb) | 离线处理 | 数据获取 requests + 清洗 pandas + 分析 pandas + 可视化 (jupyter notebook + pyecharts) | [COVID-19](https://github.com/CSSEGISandData/COVID-19/tree/master/csse_covid_19_data/csse_covid_19_time_series) 或者 [百度网盘](https://pan.baidu.com/s/1b45MqPwjEWPoTOuEXquVcw) 提取码:wgmg |
## refer