From e3216391c8a77c2b92b28ba95aa4294bc86d2622 Mon Sep 17 00:00:00 2001 From: bmartel Date: Thu, 29 Jun 2023 11:54:10 -0500 Subject: [PATCH] fix: LSDV-5337: Pre-signed file proxy url clashing with already html encoded values causing errors in signature (#4447) * fix: LSDV-5337: Pre-signed file proxy url clashing with already html encoded values causing errors in signature * Update base_models.py * updating tests to account for change in assertions of url structure, adding test to cover fallback scenario * Adding a test to outline resolution and support of the upper limits of file uris in cloud storage * removing any vendor specifics, just keep the characters * don't need this in the stubs as it was triggering false positives on security --- label_studio/data_import/api.py | 9 +- label_studio/io_storages/base_models.py | 5 +- label_studio/tests/conftest.py | 8 ++ .../io_storages_presign_proxy.tavern.yml | 88 ++++++++++-- .../tests/tasks/test_presign_storage_data.py | 132 +++++++++++++++--- 5 files changed, 209 insertions(+), 33 deletions(-) diff --git a/label_studio/data_import/api.py b/label_studio/data_import/api.py index 4f6e4e64e6d..5f2870730ed 100644 --- a/label_studio/data_import/api.py +++ b/label_studio/data_import/api.py @@ -1,5 +1,6 @@ """This file and its contents are licensed under the Apache License 2.0. Please see the included NOTICE for copyright information and LICENSE for a copy of the license. """ +import base64 import time import requests import logging @@ -602,7 +603,13 @@ def get(self, request, *args, **kwargs): if not project.has_permission(request.user): return Response(status=status.HTTP_403_FORBIDDEN) - fileuri = unquote(fileuri) + # Attempt to base64 decode the fileuri + try: + fileuri = base64.urlsafe_b64decode(fileuri.encode()).decode() + # For backwards compatibility, try unquote if this fails + except Exception as exc: + logger.debug(f'Failed to decode base64 {fileuri} for task {task_id}: {exc} falling back to unquote') + fileuri = unquote(fileuri) try: resolved = task.resolve_storage_uri(fileuri, project) diff --git a/label_studio/io_storages/base_models.py b/label_studio/io_storages/base_models.py index bb22bdc6815..ddc1ad45ee0 100644 --- a/label_studio/io_storages/base_models.py +++ b/label_studio/io_storages/base_models.py @@ -1,5 +1,6 @@ """This file and its contents are licensed under the Apache License 2.0. Please see the included NOTICE for copyright information and LICENSE for a copy of the license. """ +import base64 import rq import json import logging @@ -9,7 +10,7 @@ from rq.job import Job from django_rq import job -from urllib.parse import urljoin, quote +from urllib.parse import urljoin from django.utils import timezone from django.db import models, transaction @@ -285,7 +286,7 @@ def resolve_uri(self, uri, task=None): reverse( "data_import:storage-data-presign", kwargs={"task_id": task.id} - ) + f'?fileuri={quote(extracted_uri)}' + ) + f"?fileuri={base64.urlsafe_b64encode(extracted_uri.encode()).decode()}" ) return uri.replace(extracted_uri, proxy_url) else: diff --git a/label_studio/tests/conftest.py b/label_studio/tests/conftest.py index eba7045130d..e18fbe04716 100644 --- a/label_studio/tests/conftest.py +++ b/label_studio/tests/conftest.py @@ -139,6 +139,14 @@ def s3_with_hypertext_s3_links(s3): })) yield s3 +@pytest.fixture(autouse=True) +def s3_with_partially_encoded_s3_links(s3): + bucket_name = 'pytest-s3-json-partially-encoded' + s3.create_bucket(Bucket=bucket_name) + s3.put_object(Bucket=bucket_name, Key='test.json', Body=json.dumps({ + 'text': "" + })) + yield s3 @pytest.fixture(autouse=True) def s3_with_unexisted_links(s3): diff --git a/label_studio/tests/io_storages_presign_proxy.tavern.yml b/label_studio/tests/io_storages_presign_proxy.tavern.yml index e0d04c83aea..0023ed2a356 100644 --- a/label_studio/tests/io_storages_presign_proxy.tavern.yml +++ b/label_studio/tests/io_storages_presign_proxy.tavern.yml @@ -61,7 +61,7 @@ stages: response: json: data: - image_url: !re_match "/tasks/\\d+/presign/\\?fileuri=s3.*//pytest-s3-images.+" + image_url: !re_match "/tasks/\\d+/presign/\\?fileuri=czM6Ly9weXRlc3QtczMtaW1hZ2VzL2ltYWdlMS5qcGc=" status_code: 200 - name: stage request: @@ -82,7 +82,7 @@ stages: json: tasks: - data: - image_url: !re_match "/tasks/\\d+/presign/\\?fileuri=s3.*//pytest-s3-images.+" + image_url: !re_match "/tasks/\\d+/presign/\\?fileuri=czM6Ly9weXRlc3QtczMtaW1hZ2VzL2ltYWdlMS5qcGc=" status_code: 200 - name: stage request: @@ -303,7 +303,7 @@ stages: response: json: data: - image_url: !re_match "/tasks/\\d+/presign/\\?fileuri=gs.*//test-gs-bucket.+" + image_url: !re_match "/tasks/\\d+/presign/\\?fileuri=Z3M6Ly90ZXN0LWdzLWJ1Y2tldC90ZXN0LWdzLWJ1Y2tldC9hYmM=" status_code: 200 --- test_name: test_invalidate_gcs_storage @@ -473,16 +473,16 @@ stages: response: json: data: - image: !re_match "/tasks/\\d+/presign/\\?fileuri=gs.*//whatever-bucket-with/manual.link.+" + image: !re_match "/tasks/\\d+/presign/\\?fileuri=Z3M6Ly93aGF0ZXZlci1idWNrZXQtd2l0aC9tYW51YWwubGluay5qcGc=" dict: - key1: !re_match "/tasks/\\d+/presign/\\?fileuri=gs.*//whatever-bucket-with/manual.link.+" + key1: !re_match "/tasks/\\d+/presign/\\?fileuri=Z3M6Ly93aGF0ZXZlci1idWNrZXQtd2l0aC9tYW51YWwubGluay5qcGc=" array: - - !re_match "/tasks/\\d+/presign/\\?fileuri=gs.*//whatever-bucket-with/manual.link.+" - - !re_match "/tasks/\\d+/presign/\\?fileuri=gs.*//whatever-bucket-with/manual.link.+" + - !re_match "/tasks/\\d+/presign/\\?fileuri=Z3M6Ly93aGF0ZXZlci1idWNrZXQtd2l0aC9tYW51YWwubGluay5qcGc=" + - !re_match "/tasks/\\d+/presign/\\?fileuri=Z3M6Ly93aGF0ZXZlci1idWNrZXQtd2l0aC9tYW51YWwubGluay5qcGc=" array: - - item1: !re_match "/tasks/\\d+/presign/\\?fileuri=gs.*//whatever-bucket-with/manual.link.+" + - item1: !re_match "/tasks/\\d+/presign/\\?fileuri=Z3M6Ly93aGF0ZXZlci1idWNrZXQtd2l0aC9tYW51YWwubGluay5qcGc=" some: "some text" - - item2: !re_match "/tasks/\\d+/presign/\\?fileuri=gs.*//whatever-bucket-with/manual.link.+" + - item2: !re_match "/tasks/\\d+/presign/\\?fileuri=Z3M6Ly93aGF0ZXZlci1idWNrZXQtd2l0aC9tYW51YWwubGluay5qcGc=" some: "some text" status_code: 200 --- @@ -842,7 +842,7 @@ stages: response: json: data: - image_url: !re_match "/tasks/\\d+/presign/\\?fileuri=azure-blob.*//pytest-azure-images.+" + image_url: !re_match "/tasks/\\d+/presign/\\?fileuri=YXp1cmUtYmxvYjovL3B5dGVzdC1henVyZS1pbWFnZXMvYWJj" status_code: 200 - name: stage request: @@ -1087,9 +1087,8 @@ stages: response: json: data: - pdf: !re_match " have been resolved) +test_name: test_import_jsons_from_s3_and_resolve_partially_encoded +strict: false +marks: +- usefixtures: + - django_live_url + - fflag_fix_all_lsdv_4711_cors_errors_accessing_task_data_short_on +stages: +- id: signup + type: ref +- name: stage + request: + data: + is_published: true + label_config: + title: test_s3_storage_with_json_and_hypertext + method: POST + url: '{django_live_url}/api/projects' + response: + save: + json: + project_pk: id + status_code: 201 +- name: stage + request: + data: + bucket: pytest-s3-json-partially-encoded + project: '{project_pk}' + title: Testing S3 storage 3 (bucket from conftest.py) + use_blob_urls: false + method: POST + url: '{django_live_url}/api/storages/s3' + response: + save: + json: + storage_pk: id + status_code: 201 +- name: stage + request: + method: POST + url: '{django_live_url}/api/storages/s3/{storage_pk}/sync' + response: + json: + last_sync_count: 1 + status_code: 200 +- name: stage + request: + method: GET + url: '{django_live_url}/api/projects/{project_pk}/tasks' + response: status_code: 200 +- name: stage + request: + method: GET + url: '{django_live_url}/api/projects/{project_pk}/next' + response: + json: + data: + text: !re_match "= 512 + + # Check this resolves correctly on the server + request = APIRequestFactory().get( + reverse("data_import:storage-data-presign", kwargs={"task_id": 1}) + + f"?fileuri={base64_encoded_uri}" + ) - request = APIRequestFactory().get(reverse("data_import:storage-data-presign", - kwargs={"task_id": 1}) + "?fileuri=fileuri") request.user = user force_authenticate(request, user) response = view(request, task_id=1) + # And that the response is correct assert response.status_code == status.HTTP_303_SEE_OTHER - assert response.url == "https://presigned-url.com/file" + assert response.url == "https://presigned-url.com/fileuri"