Skip to content

Commit

Permalink
github: enable v1 ops for files larger than 1Mb
Browse files Browse the repository at this point in the history
The GitHub provider's `validate_v1_path` method was using the GitHub
`contents` API endpoint to retrieve metadata about the path, so that
it could determine whether a file or folder at that path already
exists and is of the correct type.  Unfortunately, the `contents`
endpoint also returns the file contents along with the metadata.
Because of this, GitHub will refuse to respond if the requested file
is bigger than 1 MB.  `validate_v1_path` has been updated to request a
recursive tree listing, which it can then search through for the given
path.

Fixes: [#OSF-5866]
  • Loading branch information
felliott committed Mar 17, 2016
1 parent 123a6da commit 6c294f5
Show file tree
Hide file tree
Showing 3 changed files with 68 additions and 40 deletions.
37 changes: 20 additions & 17 deletions tests/providers/github/test_provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -479,15 +479,16 @@ def test_child_gets_branch(self):

@pytest.mark.asyncio
@pytest.mark.aiohttpretty
async def test_validate_v1_path_file(self, provider, content_repo_metadata_root_file_txt):
blob_path = 'file.txt'
blob_url = provider.build_repo_url('contents', blob_path)
blob_query = '?ref=' + provider.default_branch
blob_good_url = blob_url + blob_query
blob_bad_url = blob_url + '/' + blob_query
async def test_validate_v1_path_file(self, provider, branch_metadata, repo_tree_metadata_root):
branch_url = provider.build_repo_url('branches', provider.default_branch)
tree_url = provider.build_repo_url('git', 'trees',
branch_metadata['commit']['commit']['tree']['sha'],
recursive=1)

aiohttpretty.register_json_uri('GET', branch_url, body=branch_metadata)
aiohttpretty.register_json_uri('GET', tree_url, body=repo_tree_metadata_root)

aiohttpretty.register_json_uri('GET', blob_good_url, body=content_repo_metadata_root_file_txt)
aiohttpretty.register_json_uri('GET', blob_bad_url, body=content_repo_metadata_root_file_txt)
blob_path = 'file.txt'

try:
wb_path_v1 = await provider.validate_v1_path('/' + blob_path)
Expand All @@ -505,15 +506,17 @@ async def test_validate_v1_path_file(self, provider, content_repo_metadata_root_

@pytest.mark.asyncio
@pytest.mark.aiohttpretty
async def test_validate_v1_path_folder(self, provider, content_repo_metadata_root):
tree_path = 'folder'
tree_url = provider.build_repo_url('contents', tree_path)
tree_query = '?ref=' + provider.default_branch
tree_good_url = tree_url + tree_query
tree_bad_url = tree_url + '/' + tree_query

aiohttpretty.register_json_uri('GET', tree_good_url, body=content_repo_metadata_root)
aiohttpretty.register_json_uri('GET', tree_bad_url, body=content_repo_metadata_root)
async def test_validate_v1_path_folder(self, provider, branch_metadata, repo_tree_metadata_root):
branch_url = provider.build_repo_url('branches', provider.default_branch)
tree_url = provider.build_repo_url('git', 'trees',
branch_metadata['commit']['commit']['tree']['sha'],
recursive=1)

aiohttpretty.register_json_uri('GET', branch_url, body=branch_metadata)
aiohttpretty.register_json_uri('GET', tree_url, body=repo_tree_metadata_root)

tree_path = 'level1'

try:
wb_path_v1 = await provider.validate_v1_path('/' + tree_path + '/')
except Exception as exc:
Expand Down
9 changes: 9 additions & 0 deletions waterbutler/providers/github/exceptions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
from waterbutler.core.exceptions import ProviderError


class GitHubUnsupportedRepoError(ProviderError):
    """Error for folder operations that cannot be carried out on a large repository.

    Takes no constructor arguments: the user-facing message and the ``501``
    code are fixed, so the class can be raised bare
    (``raise GitHubUnsupportedRepoError``).
    """

    def __init__(self):
        message = ('Some folder operations on large GitHub repositories cannot be supported without'
                   ' data loss. To carry out this operation, please perform it in a local git'
                   ' repository, then push to the target repository on GitHub.')
        # Must be ``__init__``: the previous ``super()._init(...)`` named a
        # method that does not exist, so constructing this exception raised
        # AttributeError instead of the intended ProviderError.
        super().__init__(message, code=501)
62 changes: 39 additions & 23 deletions waterbutler/providers/github/provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from waterbutler.providers.github.metadata import GitHubFolderContentMetadata
from waterbutler.providers.github.metadata import GitHubFileTreeMetadata
from waterbutler.providers.github.metadata import GitHubFolderTreeMetadata
from waterbutler.providers.github.exceptions import GitHubUnsupportedRepoError


GIT_EMPTY_SHA = '4b825dc642cb6eb9a060e54bf8d69288fbee4904'
Expand All @@ -37,6 +38,25 @@ def child(self, name, _id=None, folder=False):


class GitHubProvider(provider.BaseProvider):
"""Provider for GitHub repositories.
**On paths:** WB and GH use slightly different default conventions for their paths, so we
often have to munge our WB paths before comparison. Here is a quick overview::
WB (dirs): wb_dir.path == 'foo/bar/' str(wb_dir) == '/foo/bar/'
WB (file): wb_file.path == 'foo/bar.txt' str(wb_file) == '/foo/bar.txt'
GH (dir): 'foo/bar'
GH (file): 'foo/bar.txt'
Quirks:
* git doesn't have a concept of empty folders, so this provider creates 0-byte ``.gitkeep``
files in the requested folder.
* The ``contents`` endpoint cannot be used to fetch metadata reliably for all files. Requesting
a file that is larger than 1Mb will result in an error response directing you to the ``blob``
endpoint. A recursive tree fetch may be used instead.
"""
NAME = 'github'
BASE_URL = settings.BASE_URL
VIEW_URL = settings.VIEW_URL
Expand Down Expand Up @@ -67,23 +87,8 @@ async def validate_v1_path(self, path, **kwargs):
self.default_branch = self._repo['default_branch']

branch_ref = kwargs.get('ref') or kwargs.get('branch') or self.default_branch

implicit_folder = path.endswith('/')

url = furl.furl(self.build_repo_url('contents', path))
url.args.update({'ref': branch_ref})
resp = await self.make_request(
'GET',
url.url,
expects=(200, ),
throws=exceptions.MetadataError
)

content = await resp.json()
explicit_folder = isinstance(content, list)

if implicit_folder != explicit_folder:
raise exceptions.NotFoundError(path)
branch_data = await self._fetch_branch(branch_ref)
await self._search_tree_for_path(path, branch_data['commit']['commit']['tree']['sha'])

path = GitHubPath(path)
for part in path.parts:
Expand Down Expand Up @@ -516,15 +521,26 @@ async def _fetch_tree(self, sha, recursive=False):
tree = await resp.json()

if tree['truncated']:
raise exceptions.ProviderError(
('Some folder operations on large GitHub repositories cannot be supported without'
' data loss. To carry out this operation, please perform it in a local git'
' repository, then push to the target repository on GitHub.'),
code=501
)
raise GitHubUnsupportedRepoError

return tree

async def _search_tree_for_path(self, path, tree_sha, recursive=True):
    """Search through the given tree for an entity matching the name and type of `path`.

    :param str path: path to look for. A trailing slash means a folder
        (git ``tree``) is expected; otherwise a file (git ``blob``) is
        expected. Leading and trailing slashes are stripped before comparing
        against the entries' ``path`` values.
    :param str tree_sha: sha of the tree to fetch and search
    :param bool recursive: forwarded to ``_fetch_tree``; defaults to ``True``
    :returns: the first entry of ``tree['tree']`` whose ``path`` and ``type``
        both match
    :raises GitHubUnsupportedRepoError: if the fetched tree is flagged as truncated
    :raises exceptions.NotFoundError: if no entry matches
    """
    # Honour the caller's `recursive` flag; it used to be accepted but
    # ignored in favour of a hard-coded ``True``.
    tree = await self._fetch_tree(tree_sha, recursive=recursive)

    # A truncated listing is incomplete, so a miss would not prove the path
    # is absent.  NOTE(review): `_fetch_tree` appears to raise on truncation
    # already -- confirm whether this guard is still needed.
    if tree['truncated']:
        raise GitHubUnsupportedRepoError

    implicit_type = 'tree' if path.endswith('/') else 'blob'
    # Normalise once, outside the loop, instead of on every comparison.
    gh_path = path.strip('/')

    for entity in tree['tree']:
        if entity['path'] == gh_path and entity['type'] == implicit_type:
            return entity

    raise exceptions.NotFoundError(str(path))

async def _create_tree(self, tree):
resp = await self.make_request(
'POST',
Expand Down

0 comments on commit 6c294f5

Please sign in to comment.