diff --git a/COVID-19/spider.py b/COVID-19/spider.py deleted file mode 100644 index 8f4edb8..0000000 --- a/COVID-19/spider.py +++ /dev/null @@ -1,47 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -# @Time : 2021/2/9 11:06 -# @Author : way -# @Site : -# @Describe: - -import requests - - -# 重试装饰器 -def retry(func): - max_retry = 5 - - def run(*args, **kwargs): - for i in range(max_retry + 1): - if func(*args, **kwargs): - break - else: - print("retrying...") - else: - print("update fail !!!") - - return run - - -@retry -def download(url): - try: - file = url.split('/')[-1] - with open(file, 'w', encoding='utf-8-sig') as f: - f.write(requests.get(url).text) - print(f'{file} has been updated success') - return True - except Exception as e: - print(e) - return False - - -if __name__ == "__main__": - urls = [ - 'https://github.com/CSSEGISandData/COVID-19/raw/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv', - 'https://github.com/CSSEGISandData/COVID-19/raw/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv', - 'https://github.com/CSSEGISandData/COVID-19/raw/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv' - ] - for url in urls: - download(url) diff --git "a/COVID-19/\346\226\260\345\206\240\347\226\253\346\203\205\346\225\260\346\215\256\345\210\206\346\236\220.ipynb" "b/COVID-19/\346\226\260\345\206\240\347\226\253\346\203\205\346\225\260\346\215\256\345\210\206\346\236\220.ipynb" index 9e95831..313cf01 100644 --- "a/COVID-19/\346\226\260\345\206\240\347\226\253\346\203\205\346\225\260\346\215\256\345\210\206\346\236\220.ipynb" +++ "b/COVID-19/\346\226\260\345\206\240\347\226\253\346\203\205\346\225\260\346\215\256\345\210\206\346\236\220.ipynb" @@ -8,7 +8,7 @@ "# 1、数据集说明\n", "\n", "这是一份来自 Johns Hopkins University 在github 开源的全球新冠肺炎 [COVID-19](https://github.com/CSSEGISandData/COVID-19/tree/master/csse_covid_19_data/csse_covid_19_time_series) 数据集,每日时间序列汇总,包括确诊、死亡和治愈。所有数据来自每日病例报告。数据持续更新中。\n", - ">可以使用 python spider.py 获取最新的数据集文件。另外,由于数据集中没有美国的治愈数据,所以在统计全球的现有确诊人员和治愈率的时候会有很大误差,代码里面先不做这个处理,期待数据集的完善。\n" + ">由于数据集中没有美国的治愈数据,所以在统计全球的现有确诊人员和治愈率的时候会有很大误差,代码里面先不做这个处理,期待数据集的完善。\n" ] }, { @@ -199,8 +199,8 @@ " background: #F8F8F8;\n", " }\n", " \n", - "
\n", - "

(2/23/21)全球疫情情况

\n", + "
\n", + "

(3/20/21)全球疫情情况

\n", "

由于数据集没有美国的治愈数据,所以治愈人数和治愈率都远低于实际,等待数据集完善

\n", " \n", " \n", @@ -214,11 +214,11 @@ " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", "
1121081452485368632951392.22%56.46%1228137962709639695230872.21%56.61%
\n", @@ -226,7 +226,7 @@ "\n" ], "text/plain": [ - "" + "" ] }, "execution_count": 4, @@ -262,7 +262,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "id": "closed-suicide", "metadata": {}, "outputs": [ @@ -278,13 +278,13 @@ " });\n", "\n", "\n", - "
\n", + "
\n", "\n", "\n" ], "text/plain": [ - "" + "" ] }, - "execution_count": 5, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -3593,7 +3593,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "id": "corresponding-overhead", "metadata": {}, "outputs": [ @@ -3609,13 +3609,13 @@ " });\n", "\n", "\n", - "
\n", + "
\n", "\n", "\n" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "tl = Timeline()\n", - "tl.add_schema(\n", - "# is_auto_play=True,\n", - " is_loop_play=False,\n", - " play_interval=200,\n", - " )\n", - "target = confirmed_data.columns[6:].to_list()\n", - "target.reverse()\n", - "target = target[::7]\n", - "target.reverse()\n", - "for dt in target: \n", - " confirmed = confirmed_data.groupby('Country/Region').agg({dt: 'sum'}).to_dict()[dt]\n", - " c = (\n", - " Map()\n", - " .add(\"确诊人数\", [*confirmed.items()], \"world\", is_map_symbol_show=False)\n", - " .set_series_opts(label_opts=opts.LabelOpts(is_show=False))\n", - " .set_global_opts(\n", - " title_opts=opts.TitleOpts(title=\"全球疫情历史发展情况\"),\n", - " visualmap_opts=opts.VisualMapOpts(max_=200000), \n", - " \n", - " )\n", - " )\n", - " tl.add(c, dt)\n", - "tl.render_notebook()" - ] - }, - { - "cell_type": "markdown", - "id": "constitutional-macro", - "metadata": {}, - "source": [ - "### 3.1.3 各国确诊人数 TOP20 排行" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "accomplished-amazon", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - "\n", - "
\n", - "\n", - "\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tl = Timeline()\n", + "tl.add_schema(\n", + "# is_auto_play=True,\n", + " is_loop_play=False,\n", + " play_interval=200,\n", + " )\n", + "target = confirmed_data.columns[6:].to_list()\n", + "target.reverse()\n", + "target = target[::7]\n", + "target.reverse()\n", + "for dt in target: \n", + " confirmed = confirmed_data.groupby('Country/Region').agg({dt: 'sum'}).to_dict()[dt]\n", + " c = (\n", + " Map()\n", + " .add(\"确诊人数\", [*confirmed.items()], \"world\", is_map_symbol_show=False)\n", + " .set_series_opts(label_opts=opts.LabelOpts(is_show=False))\n", + " .set_global_opts(\n", + " title_opts=opts.TitleOpts(title=\"全球疫情历史发展情况\"),\n", + " visualmap_opts=opts.VisualMapOpts(max_=200000), \n", + " \n", + " )\n", + " )\n", + " tl.add(c, dt)\n", + "tl.render_notebook()" + ] + }, + { + "cell_type": "markdown", + "id": "constitutional-macro", + "metadata": {}, + "source": [ + "### 3.1.3 各国确诊人数 TOP20 排行" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "accomplished-amazon", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "
\n", + "\n", + "\n" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "tl = Timeline()\n", - "tl.add_schema(\n", - "# is_auto_play=True,\n", - " is_loop_play=False,\n", - " play_interval=100,\n", - " )\n", - "\n", - "for dt in confirmed_data.columns[6:]:\n", - " confirmed = confirmed_data.groupby('Country/Region_zh').agg({dt: 'sum'}).sort_values(by=dt, ascending=False)[:20].sort_values(by=dt).to_dict()[dt]\n", - " bar = (\n", - " Bar()\n", - " .add_xaxis([*confirmed.keys()])\n", - " .add_yaxis(\"确诊人数\", [*confirmed.values()], label_opts=opts.LabelOpts(position=\"right\"))\n", - " .reversal_axis()\n", - " .set_global_opts(\n", - " title_opts=opts.TitleOpts(\"各国确诊人数排行 TOP20\")\n", - " )\n", - " )\n", - " tl.add(bar, dt)\n", - "tl.render_notebook()\n" - ] - }, - { - "cell_type": "markdown", - "id": "negative-portsmouth", - "metadata": {}, - "source": [ - "### 3.1.4 全球疫情趋势" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "seasonal-greensboro", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - "\n", - "
\n", - "\n", - "\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tl = Timeline()\n", + "tl.add_schema(\n", + "# is_auto_play=True,\n", + " is_loop_play=False,\n", + " play_interval=100,\n", + " )\n", + "\n", + "for dt in confirmed_data.columns[6:]:\n", + " confirmed = confirmed_data.groupby('Country/Region_zh').agg({dt: 'sum'}).sort_values(by=dt, ascending=False)[:20].sort_values(by=dt).to_dict()[dt]\n", + " bar = (\n", + " Bar()\n", + " .add_xaxis([*confirmed.keys()])\n", + " .add_yaxis(\"确诊人数\", [*confirmed.values()], label_opts=opts.LabelOpts(position=\"right\"))\n", + " .reversal_axis()\n", + " .set_global_opts(\n", + " title_opts=opts.TitleOpts(\"各国确诊人数排行 TOP20\")\n", + " )\n", + " )\n", + " tl.add(bar, dt)\n", + "tl.render_notebook()\n" + ] + }, + { + "cell_type": "markdown", + "id": "negative-portsmouth", + "metadata": {}, + "source": [ + "### 3.1.4 全球疫情趋势" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "seasonal-greensboro", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "
\n", + "\n", + "\n" ], "text/plain": [ - "" + "" ] }, - "execution_count": 8, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -134227,7 +142701,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 10, "id": "educated-computer", "metadata": {}, "outputs": [ @@ -134287,8 +142761,8 @@ " background: #F8F8F8;\n", " }\n", " \n", - "
\n", - "

(2/23/21)中国疫情情况

\n", + "
\n", + "

(3/20/21)中国疫情情况

\n", "

\n", " \n", " \n", @@ -134303,12 +142777,12 @@ " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", "
1017494842962204.76%94.57%6871025234849971674.73%94.78%507
\n", @@ -134316,10 +142790,10 @@ "\n" ], "text/plain": [ - "" + "" ] }, - "execution_count": 9, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -134358,7 +142832,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 11, "id": "introductory-friendship", "metadata": {}, "outputs": [ @@ -134374,13 +142848,13 @@ " });\n", "\n", "\n", - "
\n", + "
\n", "\n", "\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "confirmed = confirmed_data_china.groupby('Province/State_zh').agg({lastdate: 'sum'}).to_dict()[lastdate]\n", + "deaths = deaths_data_china.groupby('Province/State_zh').agg({lastdate: 'sum'}).to_dict()[lastdate]\n", + "recovered = recovered_data_china.groupby('Province/State_zh').agg({lastdate: 'sum'}).to_dict()[lastdate]\n", + "exists_confirmed = {key: value - deaths[key] - recovered[key] for key, value in confirmed.items()}\n", + "c = (\n", + " Map()\n", + " .add(\"确诊人数\", [*confirmed.items()], \"china\", is_map_symbol_show=False)\n", + " .add(\"治愈人数\", [*recovered.items()], \"china\", is_map_symbol_show=False)\n", + " .add(\"死亡人数\", [*deaths.items()], \"china\", is_map_symbol_show=False)\n", + " .add(\"现有确诊人数\", [*exists_confirmed.items()], \"china\", is_map_symbol_show=False)\n", + " .set_series_opts(label_opts=opts.LabelOpts(is_show=True))\n", + " .set_global_opts(\n", + " title_opts=opts.TitleOpts(title=f'({lastdate})中国疫情现状'),\n", + " visualmap_opts=opts.VisualMapOpts(max_=1000), \n", + " )\n", + ")\n", + "c.render_notebook()" + ] + }, + { + "cell_type": "markdown", + "id": "interpreted-feedback", + "metadata": {}, + "source": [ + "### 3.2.2 中国疫情历史发展情况" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "bound-civilian", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "
\n", + "\n", + "\n" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "confirmed = confirmed_data_china.groupby('Province/State_zh').agg({lastdate: 'sum'}).to_dict()[lastdate]\n", - "deaths = deaths_data_china.groupby('Province/State_zh').agg({lastdate: 'sum'}).to_dict()[lastdate]\n", - "recovered = recovered_data_china.groupby('Province/State_zh').agg({lastdate: 'sum'}).to_dict()[lastdate]\n", - "exists_confirmed = {key: value - deaths[key] - recovered[key] for key, value in confirmed.items()}\n", - "c = (\n", - " Map()\n", - " .add(\"确诊人数\", [*confirmed.items()], \"china\", is_map_symbol_show=False)\n", - " .add(\"治愈人数\", [*recovered.items()], \"china\", is_map_symbol_show=False)\n", - " .add(\"死亡人数\", [*deaths.items()], \"china\", is_map_symbol_show=False)\n", - " .add(\"现有确诊人数\", [*exists_confirmed.items()], \"china\", is_map_symbol_show=False)\n", - " .set_series_opts(label_opts=opts.LabelOpts(is_show=True))\n", - " .set_global_opts(\n", - " title_opts=opts.TitleOpts(title=f'({lastdate})中国疫情现状'),\n", - " visualmap_opts=opts.VisualMapOpts(max_=1000), \n", - " )\n", - ")\n", - "c.render_notebook()" - ] - }, - { - "cell_type": "markdown", - "id": "interpreted-feedback", - "metadata": {}, - "source": [ - "### 3.2.2 中国疫情历史发展情况" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "bound-civilian", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - "\n", - "
\n", - "\n", - "\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tl = Timeline()\n", + "tl.add_schema(\n", + "# is_auto_play=True,\n", + " is_loop_play=False,\n", + " play_interval=200,\n", + " )\n", + "target = confirmed_data_china.columns[6:].to_list()\n", + "target.reverse()\n", + "target = target[::7]\n", + "target.reverse()\n", + "for dt in target: \n", + " confirmed = confirmed_data_china.groupby('Province/State_zh').agg({dt: 'sum'}).to_dict()[dt]\n", + " c = (\n", + " Map()\n", + " .add(\"确诊人数\", [*confirmed.items()], \"china\", is_map_symbol_show=False)\n", + " .set_series_opts(label_opts=opts.LabelOpts(is_show=True))\n", + " .set_global_opts(\n", + " title_opts=opts.TitleOpts(title='中国疫情历史发展情况'),\n", + " visualmap_opts=opts.VisualMapOpts(max_=1000), \n", + " )\n", + " )\n", + " tl.add(c, dt)\n", + "tl.render_notebook()" + ] + }, + { + "cell_type": "markdown", + "id": "conventional-childhood", + "metadata": {}, + "source": [ + "### 3.2.3 各省确诊人数排行 TOP20" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "ongoing-grenada", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "
\n", + "\n", + "\n" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "tl = Timeline()\n", - "tl.add_schema(\n", - "# is_auto_play=True,\n", - " is_loop_play=False,\n", - " play_interval=200,\n", - " )\n", - "target = confirmed_data_china.columns[6:].to_list()\n", - "target.reverse()\n", - "target = target[::7]\n", - "target.reverse()\n", - "for dt in target: \n", - " confirmed = confirmed_data_china.groupby('Province/State_zh').agg({dt: 'sum'}).to_dict()[dt]\n", - " c = (\n", - " Map()\n", - " .add(\"确诊人数\", [*confirmed.items()], \"china\", is_map_symbol_show=False)\n", - " .set_series_opts(label_opts=opts.LabelOpts(is_show=True))\n", - " .set_global_opts(\n", - " title_opts=opts.TitleOpts(title='中国疫情历史发展情况'),\n", - " visualmap_opts=opts.VisualMapOpts(max_=1000), \n", - " )\n", - " )\n", - " tl.add(c, dt)\n", - "tl.render_notebook()" - ] - }, - { - "cell_type": "markdown", - "id": "conventional-childhood", - "metadata": {}, - "source": [ - "### 3.2.3 各省确诊人数排行 TOP20" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "ongoing-grenada", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - "\n", - "
\n", - "\n", - "\n" ], "text/plain": [ - "" + "" ] }, - "execution_count": 12, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -222292,7 +236303,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 14, "id": "decreased-insert", "metadata": {}, "outputs": [ @@ -222308,13 +236319,13 @@ " });\n", "\n", "\n", - "
\n", + "
\n", "\n", "\n" ], "text/plain": [ - "" + "" ] }, - "execution_count": 13, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } diff --git a/README.md b/README.md index 30b175d..73c6f05 100644 --- a/README.md +++ b/README.md @@ -27,7 +27,7 @@ | [10 万条厦门招聘数据分析](https://github.com/TurboWay/bigdata_analyse/blob/main/AmoyJob/2021厦门招聘数据分析.md) | 离线处理 | 清洗 pandas + 分析 hive + 可视化 ( hue + pyecharts ) + 预测 sklearn | [百度网盘](https://pan.baidu.com/s/1mco8dKb5o0qPd2kqsj7bNg) 提取码:9wx0| | [7000 条租房数据分析](https://github.com/TurboWay/bigdata_analyse/blob/main/RentFromDanke/租房数据分析.md) | 离线处理 | 清洗 pandas + 分析 sqlite + 可视化 matplotlib | [百度网盘](https://pan.baidu.com/s/1l1x5qurJdkyUxAuhknj_Qw) 提取码:9en3 | | [6000 条倒闭企业数据分析](https://nbviewer.jupyter.org/github/TurboWay/bigdata_analyse/blob/main/DeathCompany/倒闭企业数据分析.ipynb) | 离线处理 | 清洗 pandas + 分析 pandas + 可视化 (jupyter notebook + pyecharts) | [百度网盘](https://pan.baidu.com/s/1I6E6i4ZadxE9IlVPe3Bqwg) 提取码:xvgm | -| [COVID-19 疫情数据分析](https://nbviewer.jupyter.org/github/TurboWay/bigdata_analyse/blob/main/COVID-19/新冠疫情数据分析.ipynb) | 离线处理 | 数据获取 requests + 清洗 pandas + 分析 pandas + 可视化 (jupyter notebook + pyecharts) | [csse_covid_19_time_series](https://github.com/CSSEGISandData/COVID-19/tree/master/csse_covid_19_data/csse_covid_19_time_series) | +| [COVID-19 疫情数据分析](https://nbviewer.jupyter.org/github/TurboWay/bigdata_analyse/blob/main/COVID-19/新冠疫情数据分析.ipynb) | 离线处理 | 数据获取 requests + 清洗 pandas + 分析 pandas + 可视化 (jupyter notebook + pyecharts) | [COVID-19](https://github.com/CSSEGISandData/COVID-19/tree/master/csse_covid_19_data/csse_covid_19_time_series) 或者 [百度网盘](https://pan.baidu.com/s/1b45MqPwjEWPoTOuEXquVcw) 提取码:wgmg | ## refer