Skip to content

Commit

Permalink
remove upper-bound of github documents in fetcher
Browse files Browse the repository at this point in the history
  • Loading branch information
yujonglee committed Sep 18, 2024
1 parent 95a74aa commit 019d53e
Show file tree
Hide file tree
Showing 2 changed files with 159 additions and 83 deletions.
122 changes: 80 additions & 42 deletions core/lib/canary/sources/github_discussion_fetcher.ex
Original file line number Diff line number Diff line change
Expand Up @@ -43,14 +43,51 @@ defmodule Canary.Sources.GithubDiscussion.Fetcher do
# Entry point for a `:github_discussion` source: pulls the repository owner and
# name out of the source's config and returns every fetched document wrapped in
# an `:ok` tuple.
def run(%Source{
      config: %Ash.Union{
        type: :github_discussion,
        value: %GithubDiscussion.Config{owner: owner, repo: repo}
      }
    }) do
  {:ok, fetch_all(owner, repo)}
end

# Upper bound on consecutive rate-limit waits for one page. Without it, a 403
# that never clears (for example a permission problem rather than a rate limit)
# would make `fetch_all/2` sleep and retry forever.
@max_page_retries 5

@doc """
Fetches every discussion page of `owner`/`repo` by following the GraphQL cursor.

Pages are requested through `fetch_page/3` until `pageInfo.hasNextPage` is
falsy. Each discussion node is converted with `transform_discussion_node/1`,
so the result is a list with one entry per discussion, each entry being the
list `[discussion | comments]`.

Fetching is best-effort:

  * `{:try_after_s, seconds}` sleeps for `seconds` and retries the same cursor,
    at most #{@max_page_retries} times in a row;
  * `{:error, reason}`, or running out of retries, logs a warning and stops;
    whatever was collected so far is returned.
"""
def fetch_all(owner, repo) do
  require Logger

  # Unfold state is either `:stop` or `{cursor, consecutive_retries}`.
  Stream.unfold({nil, 0}, fn
    :stop ->
      nil

    {cursor, retries} ->
      case fetch_page(owner, repo, cursor) do
        {:ok, data} ->
          # `nil[key]` is `nil`, so a missing or null "repository" (which the
          # API may return together with an "errors" entry) ends up here as
          # `nil`. Default the nodes to `[]` so the mapping step below does not
          # raise on a non-enumerable.
          discussions = data["repository"]["discussions"]
          nodes = discussions["nodes"] || []
          page_info = discussions["pageInfo"]

          if page_info["hasNextPage"] do
            {nodes, {page_info["endCursor"], 0}}
          else
            {nodes, :stop}
          end

        {:try_after_s, seconds} when retries < @max_page_retries ->
          Process.sleep(seconds * 1000)
          {[], {cursor, retries + 1}}

        {:try_after_s, _seconds} ->
          Logger.warning(
            "github discussion fetch for #{owner}/#{repo} stopped: " <>
              "still rate limited after #{@max_page_retries} retries"
          )

          {[], :stop}

        {:error, reason} ->
          Logger.warning(
            "github discussion fetch for #{owner}/#{repo} stopped: #{inspect(reason)}"
          )

          {[], :stop}
      end
  end)
  |> Stream.flat_map(fn nodes -> Enum.map(nodes, &transform_discussion_node/1) end)
  |> Enum.to_list()
end

def fetch_page(owner, repo, cursor) do
result =
client()
|> Req.post(
graphql:
{"""
query ($owner: String!, $repo: String!, $discussion_n: Int!, $comment_n: Int!) {
query ($owner: String!, $repo: String!, $discussion_n: Int!, $comment_n: Int!, $cursor: String) {
repository(owner: $owner, name: $repo) {
discussions(first: $discussion_n, orderBy: {field: UPDATED_AT, direction: DESC}) {
discussions(first: $discussion_n, orderBy: {field: UPDATED_AT, direction: DESC}, after: $cursor) {
pageInfo {
endCursor
hasNextPage
}
nodes {
id
url
Expand Down Expand Up @@ -79,15 +116,18 @@ defmodule Canary.Sources.GithubDiscussion.Fetcher do
}
}
""",
Map.merge(
%{discussion_n: @default_discussion_n, comment_n: @default_comment_n},
%{repo: config.repo, owner: config.owner}
)}
%{
discussion_n: @default_discussion_n,
comment_n: @default_comment_n,
repo: repo,
owner: owner,
cursor: cursor
}}
)

case result do
{:ok, %{status: 200, body: %{"data" => data}}} ->
{:ok, to_document(data)}
{:ok, data}

# https://docs.github.com/en/graphql/overview/rate-limits-and-node-limits-for-the-graphql-api#exceeding-the-rate-limit
{:ok, %{status: 403, headers: headers}} ->
Expand All @@ -97,45 +137,43 @@ defmodule Canary.Sources.GithubDiscussion.Fetcher do
{:try_after_s, 60}
end

{:ok, %{status: 200, body: %{"errors" => errors}}} ->
{:error, errors}

{:error, error} ->
{:error, error}
end
end

defp to_document(data) do
discussions = data["repository"]["discussions"]["nodes"]

discussions
|> Enum.map(fn discussion ->
top = %GithubDiscussion.FetcherResult{
node_id: discussion["id"],
title: discussion["title"],
content: discussion["body"],
url: discussion["url"],
created_at: discussion["createdAt"],
author_name: discussion["author"]["login"],
author_avatar_url: discussion["author"]["avatarUrl"],
comment: false,
closed: discussion["closed"],
answered: discussion["isAnswered"]
}

comments =
discussion["comments"]["nodes"]
|> Enum.map(fn comment ->
%GithubDiscussion.FetcherResult{
node_id: comment["id"],
title: "",
content: comment["body"],
url: comment["url"],
created_at: comment["createdAt"],
author_name: comment["author"]["login"],
author_avatar_url: comment["author"]["avatarUrl"],
comment: true
}
end)

[top | comments]
end)
# Converts one discussion node of the GraphQL response into a list of
# `GithubDiscussion.FetcherResult` structs: the discussion itself first
# (`comment: false`), followed by one struct per entry in its
# `"comments" -> "nodes"` list (`comment: true`, empty title).
defp transform_discussion_node(discussion) do
  discussion_author = discussion["author"]

  replies =
    for reply <- discussion["comments"]["nodes"] do
      reply_author = reply["author"]

      %GithubDiscussion.FetcherResult{
        node_id: reply["id"],
        title: "",
        content: reply["body"],
        url: reply["url"],
        created_at: reply["createdAt"],
        author_name: reply_author["login"],
        author_avatar_url: reply_author["avatarUrl"],
        comment: true
      }
    end

  post = %GithubDiscussion.FetcherResult{
    node_id: discussion["id"],
    title: discussion["title"],
    content: discussion["body"],
    url: discussion["url"],
    created_at: discussion["createdAt"],
    author_name: discussion_author["login"],
    author_avatar_url: discussion_author["avatarUrl"],
    comment: false,
    closed: discussion["closed"],
    answered: discussion["isAnswered"]
  }

  [post | replies]
end
end
120 changes: 79 additions & 41 deletions core/lib/canary/sources/github_issue_fetcher.ex
Original file line number Diff line number Diff line change
Expand Up @@ -39,14 +39,51 @@ defmodule Canary.Sources.GithubIssue.Fetcher do
end

# Entry point for a `:github_issue` source: pulls the repository owner and name
# out of the source's config and returns every fetched document wrapped in an
# `:ok` tuple.
def run(%Source{
      config: %Ash.Union{
        type: :github_issue,
        value: %GithubIssue.Config{owner: owner, repo: repo}
      }
    }) do
  {:ok, fetch_all(owner, repo)}
end

# Upper bound on consecutive rate-limit waits for one page. Without it, a 403
# that never clears (for example a permission problem rather than a rate limit)
# would make `fetch_all/2` sleep and retry forever.
@max_page_retries 5

# Fetches every issue page of `owner`/`repo` by following the GraphQL cursor.
#
# Pages are requested through `fetch_page/3` until `pageInfo.hasNextPage` is
# falsy. Each issue node is converted with `transform_issue_node/1`, so the
# result is a list with one entry per issue, each entry being the list
# `[issue | comments]`.
#
# Fetching is best-effort:
#   * `{:try_after_s, seconds}` sleeps for `seconds` and retries the same
#     cursor, at most `@max_page_retries` times in a row;
#   * `{:error, reason}`, or running out of retries, logs a warning and stops;
#     whatever was collected so far is returned.
defp fetch_all(owner, repo) do
  require Logger

  # Unfold state is either `:stop` or `{cursor, consecutive_retries}`.
  Stream.unfold({nil, 0}, fn
    :stop ->
      nil

    {cursor, retries} ->
      case fetch_page(owner, repo, cursor) do
        {:ok, data} ->
          # `nil[key]` is `nil`, so a missing or null "repository" (which the
          # API may return together with an "errors" entry) ends up here as
          # `nil`. Default the nodes to `[]` so the mapping step below does not
          # raise on a non-enumerable.
          issues = data["repository"]["issues"]
          nodes = issues["nodes"] || []
          page_info = issues["pageInfo"]

          if page_info["hasNextPage"] do
            {nodes, {page_info["endCursor"], 0}}
          else
            {nodes, :stop}
          end

        {:try_after_s, seconds} when retries < @max_page_retries ->
          Process.sleep(seconds * 1000)
          {[], {cursor, retries + 1}}

        {:try_after_s, _seconds} ->
          Logger.warning(
            "github issue fetch for #{owner}/#{repo} stopped: " <>
              "still rate limited after #{@max_page_retries} retries"
          )

          {[], :stop}

        {:error, reason} ->
          Logger.warning(
            "github issue fetch for #{owner}/#{repo} stopped: #{inspect(reason)}"
          )

          {[], :stop}
      end
  end)
  |> Stream.flat_map(fn nodes -> Enum.map(nodes, &transform_issue_node/1) end)
  |> Enum.to_list()
end

defp fetch_page(owner, repo, cursor) do
result =
client()
|> Req.post(
graphql:
{"""
query ($owner: String!, $repo: String!, $issue_n: Int!, $comment_n: Int!) {
query ($owner: String!, $repo: String!, $issue_n: Int!, $comment_n: Int!, $cursor: String) {
repository(owner: $owner, name: $repo) {
issues(first: $issue_n, orderBy: {field: UPDATED_AT, direction: DESC}) {
issues(first: $issue_n, orderBy: {field: UPDATED_AT, direction: DESC}, after: $cursor) {
pageInfo {
endCursor
hasNextPage
}
nodes {
id
bodyUrl
Expand Down Expand Up @@ -74,15 +111,18 @@ defmodule Canary.Sources.GithubIssue.Fetcher do
}
}
""",
Map.merge(
%{issue_n: @default_issue_n, comment_n: @default_comment_n},
%{repo: config.repo, owner: config.owner}
)}
%{
issue_n: @default_issue_n,
comment_n: @default_comment_n,
repo: repo,
owner: owner,
cursor: cursor
}}
)

case result do
{:ok, %{status: 200, body: %{"data" => data}}} ->
{:ok, process(data)}
{:ok, data}

# https://docs.github.com/en/graphql/overview/rate-limits-and-node-limits-for-the-graphql-api#exceeding-the-rate-limit
{:ok, %{status: 403, headers: headers}} ->
Expand All @@ -92,44 +132,42 @@ defmodule Canary.Sources.GithubIssue.Fetcher do
{:try_after_s, 60}
end

{:ok, %{status: 200, body: %{"errors" => errors}}} ->
{:error, errors}

{:error, error} ->
{:error, error}
end
end

defp process(data) do
issues = data["repository"]["issues"]["nodes"]

issues
|> Enum.map(fn issue ->
top = %GithubIssue.FetcherResult{
node_id: issue["id"],
title: issue["title"],
content: issue["body"],
url: issue["bodyUrl"],
created_at: issue["createdAt"],
author_name: issue["author"]["login"],
author_avatar_url: issue["author"]["avatarUrl"],
comment: false,
closed: issue["closed"]
}

comments =
issue["comments"]["nodes"]
|> Enum.map(fn comment ->
%GithubIssue.FetcherResult{
node_id: comment["id"],
title: "",
content: comment["body"],
url: comment["url"],
created_at: comment["createdAt"],
author_name: comment["author"]["login"],
author_avatar_url: comment["author"]["avatarUrl"],
comment: true
}
end)

[top | comments]
end)
# Converts one issue node of the GraphQL response into a list of
# `GithubIssue.FetcherResult` structs: the issue itself first
# (`comment: false`, url taken from "bodyUrl"), followed by one struct per
# entry in its `"comments" -> "nodes"` list (`comment: true`, empty title).
defp transform_issue_node(issue) do
  issue_author = issue["author"]

  replies =
    for reply <- issue["comments"]["nodes"] do
      reply_author = reply["author"]

      %GithubIssue.FetcherResult{
        node_id: reply["id"],
        title: "",
        content: reply["body"],
        url: reply["url"],
        created_at: reply["createdAt"],
        author_name: reply_author["login"],
        author_avatar_url: reply_author["avatarUrl"],
        comment: true
      }
    end

  post = %GithubIssue.FetcherResult{
    node_id: issue["id"],
    title: issue["title"],
    content: issue["body"],
    url: issue["bodyUrl"],
    created_at: issue["createdAt"],
    author_name: issue_author["login"],
    author_avatar_url: issue_author["avatarUrl"],
    comment: false,
    closed: issue["closed"]
  }

  [post | replies]
end
end

0 comments on commit 019d53e

Please sign in to comment.