Skip to content

Commit

Permalink
fix: LSDV-5337: Pre-signed file proxy url clashing with already html …
Browse files Browse the repository at this point in the history
…encoded values causing errors in signature (#4447)

* fix: LSDV-5337: Pre-signed file proxy url clashing with already html encoded values causing errors in signature

* Update base_models.py

* updating tests to account for change in assertions of url structure, adding test to cover fallback scenario

* Adding a test to outline resolution and support of the upper limits of file uris in cloud storage

* removing any vendor specifics, just keep the characters

* don't need this in the stubs as it was triggering false positives on security
  • Loading branch information
bmartel committed Jun 29, 2023
1 parent f0828fd commit e321639
Show file tree
Hide file tree
Showing 5 changed files with 209 additions and 33 deletions.
9 changes: 8 additions & 1 deletion label_studio/data_import/api.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""This file and its contents are licensed under the Apache License 2.0. Please see the included NOTICE for copyright information and LICENSE for a copy of the license.
"""
import base64
import time
import requests
import logging
Expand Down Expand Up @@ -602,7 +603,13 @@ def get(self, request, *args, **kwargs):
if not project.has_permission(request.user):
return Response(status=status.HTTP_403_FORBIDDEN)

fileuri = unquote(fileuri)
# Attempt to base64 decode the fileuri
try:
fileuri = base64.urlsafe_b64decode(fileuri.encode()).decode()
# For backwards compatibility, try unquote if this fails
except Exception as exc:
logger.debug(f'Failed to decode base64 {fileuri} for task {task_id}: {exc} falling back to unquote')
fileuri = unquote(fileuri)

try:
resolved = task.resolve_storage_uri(fileuri, project)
Expand Down
5 changes: 3 additions & 2 deletions label_studio/io_storages/base_models.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""This file and its contents are licensed under the Apache License 2.0. Please see the included NOTICE for copyright information and LICENSE for a copy of the license.
"""
import base64
import rq
import json
import logging
Expand All @@ -9,7 +10,7 @@

from rq.job import Job
from django_rq import job
from urllib.parse import urljoin, quote
from urllib.parse import urljoin

from django.utils import timezone
from django.db import models, transaction
Expand Down Expand Up @@ -285,7 +286,7 @@ def resolve_uri(self, uri, task=None):
reverse(
"data_import:storage-data-presign",
kwargs={"task_id": task.id}
) + f'?fileuri={quote(extracted_uri)}'
) + f"?fileuri={base64.urlsafe_b64encode(extracted_uri.encode()).decode()}"
)
return uri.replace(extracted_uri, proxy_url)
else:
Expand Down
8 changes: 8 additions & 0 deletions label_studio/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,14 @@ def s3_with_hypertext_s3_links(s3):
}))
yield s3

@pytest.fixture(autouse=True)
def s3_with_partially_encoded_s3_links(s3):
bucket_name = 'pytest-s3-json-partially-encoded'
s3.create_bucket(Bucket=bucket_name)
s3.put_object(Bucket=bucket_name, Key='test.json', Body=json.dumps({
'text': "<a href=\"s3://hypertext-bucket/file with /spaces and' / ' / %2Bquotes%3D.jpg\"/>"
}))
yield s3

@pytest.fixture(autouse=True)
def s3_with_unexisted_links(s3):
Expand Down
88 changes: 75 additions & 13 deletions label_studio/tests/io_storages_presign_proxy.tavern.yml
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ stages:
response:
json:
data:
image_url: !re_match "/tasks/\\d+/presign/\\?fileuri=s3.*//pytest-s3-images.+"
image_url: !re_match "/tasks/\\d+/presign/\\?fileuri=czM6Ly9weXRlc3QtczMtaW1hZ2VzL2ltYWdlMS5qcGc="
status_code: 200
- name: stage
request:
Expand All @@ -82,7 +82,7 @@ stages:
json:
tasks:
- data:
image_url: !re_match "/tasks/\\d+/presign/\\?fileuri=s3.*//pytest-s3-images.+"
image_url: !re_match "/tasks/\\d+/presign/\\?fileuri=czM6Ly9weXRlc3QtczMtaW1hZ2VzL2ltYWdlMS5qcGc="
status_code: 200
- name: stage
request:
Expand Down Expand Up @@ -303,7 +303,7 @@ stages:
response:
json:
data:
image_url: !re_match "/tasks/\\d+/presign/\\?fileuri=gs.*//test-gs-bucket.+"
image_url: !re_match "/tasks/\\d+/presign/\\?fileuri=Z3M6Ly90ZXN0LWdzLWJ1Y2tldC90ZXN0LWdzLWJ1Y2tldC9hYmM="
status_code: 200
---
test_name: test_invalidate_gcs_storage
Expand Down Expand Up @@ -473,16 +473,16 @@ stages:
response:
json:
data:
image: !re_match "/tasks/\\d+/presign/\\?fileuri=gs.*//whatever-bucket-with/manual.link.+"
image: !re_match "/tasks/\\d+/presign/\\?fileuri=Z3M6Ly93aGF0ZXZlci1idWNrZXQtd2l0aC9tYW51YWwubGluay5qcGc="
dict:
key1: !re_match "/tasks/\\d+/presign/\\?fileuri=gs.*//whatever-bucket-with/manual.link.+"
key1: !re_match "/tasks/\\d+/presign/\\?fileuri=Z3M6Ly93aGF0ZXZlci1idWNrZXQtd2l0aC9tYW51YWwubGluay5qcGc="
array:
- !re_match "/tasks/\\d+/presign/\\?fileuri=gs.*//whatever-bucket-with/manual.link.+"
- !re_match "/tasks/\\d+/presign/\\?fileuri=gs.*//whatever-bucket-with/manual.link.+"
- !re_match "/tasks/\\d+/presign/\\?fileuri=Z3M6Ly93aGF0ZXZlci1idWNrZXQtd2l0aC9tYW51YWwubGluay5qcGc="
- !re_match "/tasks/\\d+/presign/\\?fileuri=Z3M6Ly93aGF0ZXZlci1idWNrZXQtd2l0aC9tYW51YWwubGluay5qcGc="
array:
- item1: !re_match "/tasks/\\d+/presign/\\?fileuri=gs.*//whatever-bucket-with/manual.link.+"
- item1: !re_match "/tasks/\\d+/presign/\\?fileuri=Z3M6Ly93aGF0ZXZlci1idWNrZXQtd2l0aC9tYW51YWwubGluay5qcGc="
some: "some text"
- item2: !re_match "/tasks/\\d+/presign/\\?fileuri=gs.*//whatever-bucket-with/manual.link.+"
- item2: !re_match "/tasks/\\d+/presign/\\?fileuri=Z3M6Ly93aGF0ZXZlci1idWNrZXQtd2l0aC9tYW51YWwubGluay5qcGc="
some: "some text"
status_code: 200
---
Expand Down Expand Up @@ -842,7 +842,7 @@ stages:
response:
json:
data:
image_url: !re_match "/tasks/\\d+/presign/\\?fileuri=azure-blob.*//pytest-azure-images.+"
image_url: !re_match "/tasks/\\d+/presign/\\?fileuri=YXp1cmUtYmxvYjovL3B5dGVzdC1henVyZS1pbWFnZXMvYWJj"
status_code: 200
- name: stage
request:
Expand Down Expand Up @@ -1087,9 +1087,8 @@ stages:
response:
json:
data:
pdf: !re_match "<embed src='/tasks/\\d+/presign/\\?fileuri=gs.*//heartex-test/file.pdf?.*"
pdf: !re_match "<embed src='/tasks/\\d+/presign/\\?fileuri=Z3M6Ly9oZWFydGV4LXRlc3QvZmlsZS5wZGY="
status_code: 200

---
# we check here 2 things:
# - that json blobs are successfully synced from bucket,
Expand Down Expand Up @@ -1153,9 +1152,72 @@ stages:
response:
json:
data:
text: !re_match "<a href=\"/tasks/\\d+/presign/\\?fileuri=s3.*//hypertext-bucket/file%20with%20/spaces%20and%27%20/%20%27%20/%20quotes.jpg"
text: !re_match "<a href=\"/tasks/\\d+/presign/\\?fileuri=czM6Ly9oeXBlcnRleHQtYnVja2V0L2ZpbGUgd2l0aCAvc3BhY2VzIGFuZCcgLyAnIC8gcXVvdGVzLmpwZw=="
status_code: 200
---
# - Check that json blobs containing partially encoded contents resolve correctly from the bucket,
# - s3:// links inside hypertext (like <a href="s3://path/to/%2Bbucket%3D.jpg"/> have been resolved)
test_name: test_import_jsons_from_s3_and_resolve_partially_encoded
strict: false
marks:
- usefixtures:
- django_live_url
- fflag_fix_all_lsdv_4711_cors_errors_accessing_task_data_short_on
stages:
- id: signup
type: ref
- name: stage
request:
data:
is_published: true
label_config: <View><HyperText name="text" value="$text"/><Choices name="label"
toName="text"><Choice value="pos"/><Choice value="neg"/></Choices></View>
title: test_s3_storage_with_json_and_hypertext
method: POST
url: '{django_live_url}/api/projects'
response:
save:
json:
project_pk: id
status_code: 201
- name: stage
request:
data:
bucket: pytest-s3-json-partially-encoded
project: '{project_pk}'
title: Testing S3 storage 3 (bucket from conftest.py)
use_blob_urls: false
method: POST
url: '{django_live_url}/api/storages/s3'
response:
save:
json:
storage_pk: id
status_code: 201
- name: stage
request:
method: POST
url: '{django_live_url}/api/storages/s3/{storage_pk}/sync'
response:
json:
last_sync_count: 1
status_code: 200
- name: stage
request:
method: GET
url: '{django_live_url}/api/projects/{project_pk}/tasks'
response:
status_code: 200

- name: stage
request:
method: GET
url: '{django_live_url}/api/projects/{project_pk}/next'
response:
json:
data:
text: !re_match "<a href=\"/tasks/\\d+/presign/\\?fileuri=czM6Ly9oeXBlcnRleHQtYnVja2V0L2ZpbGUgd2l0aCAvc3BhY2VzIGFuZCcgLyAnIC8gJTJCcXVvdGVzJTNELmpwZw=="
status_code: 200
---
# we don't fail when unexisted s3:// links occur in the list
test_name: test_import_json_with_unexisted_s3_links
Expand Down
Loading

0 comments on commit e321639

Please sign in to comment.