forked from pingcap/ossinsight
-
Notifications
You must be signed in to change notification settings - Fork 0
/
import_from_bigquery.sql
115 lines (109 loc) · 4.63 KB
/
import_from_bigquery.sql
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
CREATE TEMP TABLE archive AS SELECT
coalesce(id, '0') as id,
coalesce(type, '') as type,
created_at,
coalesce(repo.id, 0) as repo_id,
coalesce(
CASE repo.name
WHEN '/' THEN json_value(payload, '$.repo')
WHEN null THEN json_value(payload, '$.repo')
ELSE repo.name
END,
''
)
as repo_name,
coalesce(actor.id, 0) as actor_id,
coalesce(actor.login, '') as actor_login,
coalesce(json_value(payload, '$.pull_request.base.repo.language'), '') as language,
coalesce(json_value(payload, '$.pull_request.additions'), '0') as additions,
coalesce(json_value(payload, '$.pull_request.deletions'), '0') as deletions,
coalesce(json_value(payload, '$.action'), '') as action,
coalesce(
CASE
WHEN json_value(payload, '$.pull_request.number') is not null THEN json_value(payload, '$.pull_request.number')
WHEN json_value(payload, '$.issue.number') is not null THEN json_value(payload, '$.issue.number')
ELSE json_value(payload, '$.number')
END,
'0')
as number,
coalesce(json_value(payload, '$.comment.commit_id'), '') as commit_id,
coalesce(json_value(payload, '$.comment.id'), '0') as comment_id,
coalesce(json_value(payload, '$.size'), '0') as push_size,
coalesce(json_value(payload, '$.distinct_size'), '0') as push_distinct_size,
coalesce(org.login, '') as org_login,
coalesce(org.id, 0) as org_id,
coalesce(
CASE
WHEN json_value(payload, '$.pull_request.state') is not null THEN json_value(payload, '$.pull_request.state')
WHEN json_value(payload, '$.issue.state') is not null THEN json_value(payload, '$.issue.state')
ELSE null
END,
'')
as state,
coalesce(
CASE
WHEN json_value(payload, '$.pull_request.closed_at') is not null THEN json_value(payload, '$.pull_request.closed_at')
WHEN json_value(payload, '$.issue.closed_at') is not null THEN json_value(payload, '$.issue.closed_at')
ELSE null
END,
'1970-01-01 00:00:00')
as closed_at,
coalesce(
CASE
WHEN json_value(payload, '$.pull_request.comments') is not null THEN json_value(payload, '$.pull_request.comments')
WHEN json_value(payload, '$.issue.comments') is not null THEN json_value(payload, '$.issue.comments')
ELSE null
END,
'0')
as comments,
coalesce(cast(cast(json_value(payload, '$.pull_request.merged') as bool) as int64), 0) as pr_merged,
coalesce(json_value(payload, '$.pull_request.merged_at'), '1970-01-01 00:00:00') as pr_merged_at,
coalesce(json_value(payload, '$.pull_request.changed_files'), '0') as pr_changed_files,
coalesce(json_value(payload, '$.pull_request.review_comments'), '0') as pr_review_comments,
coalesce(
CASE
WHEN json_value(payload, '$.pull_request.id') is not null THEN json_value(payload, '$.pull_request.id')
WHEN json_value(payload, '$.issue.id') is not null THEN json_value(payload, '$.issue.id')
ELSE null
END,
'')
as pr_or_issue_id,
coalesce(
CASE
WHEN json_value(payload, '$.comment.user.login') is not null THEN json_value(payload, '$.comment.user.login')
WHEN json_value(payload, '$.review.user.login') is not null THEN json_value(payload, '$.review.user.login')
WHEN json_value(payload, '$.issue.user.login') is not null THEN json_value(payload, '$.issue.user.login')
WHEN json_value(payload, '$.pull_request.user.login') is not null THEN json_value(payload, '$.pull_request.user.login')
ELSE null
END,
'')
as creator_user_login,
coalesce(
CASE
WHEN json_value(payload, '$.comment.user.id') is not null THEN json_value(payload, '$.comment.user.id')
WHEN json_value(payload, '$.review.user.id') is not null THEN json_value(payload, '$.review.user.id')
WHEN json_value(payload, '$.issue.user.id') is not null THEN json_value(payload, '$.issue.user.id')
WHEN json_value(payload, '$.pull_request.user.id') is not null THEN json_value(payload, '$.pull_request.user.id')
ELSE null
END,
'0')
as creator_user_id,
coalesce(
CASE
WHEN json_value(payload, '$.issue.created_at') is not null THEN json_value(payload, '$.issue.created_at')
WHEN json_value(payload, '$.pull_request.created_at') is not null THEN json_value(payload, '$.pull_request.created_at')
ELSE null
END,
'1970-01-01 00:00:00')
as pr_or_issue_created_at,
FORMAT_DATE('%Y-%m-%d', created_at) as event_day,
FORMAT_DATE('%Y-%m-01', created_at) as event_month,
EXTRACT(year FROM created_at) as event_year
FROM `githubarchive.day.2*`;
EXPORT DATA OPTIONS(
uri='gs://gharchive.live/new/gharchive_dev.github_events.*.csv',
format='CSV',
overwrite=true,
header=true,
field_delimiter=',') AS
SELECT * FROM archive