diff --git a/dockerfiles/nginx/proxito.conf.template b/dockerfiles/nginx/proxito.conf.template index 844deb65dc1..99c189d0023 100644 --- a/dockerfiles/nginx/proxito.conf.template +++ b/dockerfiles/nginx/proxito.conf.template @@ -77,6 +77,8 @@ server { add_header X-RTD-Project-Method $rtd_project_method always; set $rtd_redirect $upstream_http_x_rtd_redirect; add_header X-RTD-Redirect $rtd_redirect always; + set $rtd_resolver_filename $upstream_http_x_rtd_resolver_filename; + add_header X-RTD-Resolver-Filename $rtd_resolver_filename always; set $cdn_cache_control $upstream_http_cdn_cache_control; add_header CDN-Cache-Control $cdn_cache_control always; set $cache_tag $upstream_http_cache_tag; diff --git a/readthedocs/proxito/middleware.py b/readthedocs/proxito/middleware.py index fbc200220f8..b6807967a74 100644 --- a/readthedocs/proxito/middleware.py +++ b/readthedocs/proxito/middleware.py @@ -15,9 +15,11 @@ ) from django.conf import settings from django.core.exceptions import SuspiciousOperation +from django.http.response import BadHeaderError, ResponseHeaders from django.shortcuts import redirect from django.urls import reverse from django.utils.deprecation import MiddlewareMixin +from django.utils.html import escape from readthedocs.builds.models import Version from readthedocs.core.unresolver import ( @@ -198,6 +200,7 @@ def _set_request_attributes(self, request, unresolved_domain): def process_request(self, request): # noqa # Initialize our custom request attributes. request.unresolved_domain = None + request.unresolved_url = None skip = any(request.path.startswith(reverse(view)) for view in self.skip_views) if skip: @@ -367,11 +370,33 @@ def _get_https_redirect(self, request): return None + def add_resolver_headers(self, request, response): + if request.unresolved_url is not None: + # TODO: add more ``X-RTD-Resolver-*`` headers + header_value = escape(request.unresolved_url.filename) + try: + # Use Django internals to validate the header's value before injecting it. + ResponseHeaders({})._convert_to_charset( + header_value, + "latin-1", + mime_encode=True, + ) + + response["X-RTD-Resolver-Filename"] = header_value + except BadHeaderError: + # Skip adding the header because it fails validation + log.info( + "Skip adding X-RTD-Resolver-Filename header due to invalid value.", + filename=request.unresolved_url.filename, + value=header_value, + ) + def process_response(self, request, response): # noqa self.add_proxito_headers(request, response) self.add_cache_headers(request, response) self.add_hsts_headers(request, response) self.add_user_headers(request, response) self.add_hosting_integrations_headers(request, response) + self.add_resolver_headers(request, response) self.add_cors_headers(request, response) return response diff --git a/readthedocs/proxito/tests/test_headers.py b/readthedocs/proxito/tests/test_headers.py index 1b2ce6726b4..f99a6a3dc45 100644 --- a/readthedocs/proxito/tests/test_headers.py +++ b/readthedocs/proxito/tests/test_headers.py @@ -51,10 +51,30 @@ def test_serve_headers(self): self.assertEqual(r["X-RTD-Project-Method"], "public_domain") self.assertEqual(r["X-RTD-Version"], "latest") self.assertEqual(r["X-RTD-version-Method"], "path") + self.assertEqual(r["X-RTD-Resolver-Filename"], "/") self.assertEqual( r["X-RTD-Path"], "/proxito/media/html/project/latest/index.html" ) + def test_serve_headers_with_path(self): + r = self.client.get( + "/en/latest/guides/jupyter/gallery.html", + secure=True, + headers={"host": "project.dev.readthedocs.io"}, + ) + self.assertEqual(r.status_code, 200) + self.assertEqual(r["Cache-Tag"], "project,project:latest") + self.assertEqual(r["X-RTD-Domain"], "project.dev.readthedocs.io") + self.assertEqual(r["X-RTD-Project"], "project") + self.assertEqual(r["X-RTD-Project-Method"], "public_domain") + self.assertEqual(r["X-RTD-Version"], "latest") + self.assertEqual(r["X-RTD-version-Method"], "path") + self.assertEqual(r["X-RTD-Resolver-Filename"], "/guides/jupyter/gallery.html") + self.assertEqual( + r["X-RTD-Path"], + "/proxito/media/html/project/latest/guides/jupyter/gallery.html", + ) + def test_subproject_serve_headers(self): r = self.client.get( "/projects/subproject/en/latest/", diff --git a/readthedocs/proxito/tests/test_old_redirects.py b/readthedocs/proxito/tests/test_old_redirects.py index e2a388ac7c6..a1604668e82 100644 --- a/readthedocs/proxito/tests/test_old_redirects.py +++ b/readthedocs/proxito/tests/test_old_redirects.py @@ -1352,6 +1352,7 @@ def test_redirect_exact_with_wildcard_crossdomain(self): r = self.client.get(url, headers={"host": "project.dev.readthedocs.io"}) self.assertEqual(r.status_code, 302, url) self.assertEqual(r["Location"], expected_location, url) + self.assertNotIn("X-RTD-Resolver-Filename", r.headers) def test_redirect_html_to_clean_url_crossdomain(self): """ diff --git a/readthedocs/proxito/views/serve.py b/readthedocs/proxito/views/serve.py index 5bf63950bc5..4f44d51fa07 100644 --- a/readthedocs/proxito/views/serve.py +++ b/readthedocs/proxito/views/serve.py @@ -265,6 +265,10 @@ def serve_path(self, request, path): version = unresolved.version filename = unresolved.filename + # Inject the UnresolvedURL into the HttpRequest so we can access from the middleware. + # We could resolve it again from the middleware, but we would duplicating DB queries. + request.unresolved_url = unresolved + # Check if the old language code format was used, and redirect to the new one. # NOTE: we may have some false positives here, for example for an URL like: # /pt-br/latest/pt_BR/index.html, but our protection for infinite redirects @@ -391,7 +395,7 @@ def get(self, request, proxito_path): the Docs default page (Maze Found) is rendered by Django and served. """ log.bind(proxito_path=proxito_path) - log.debug('Executing 404 handler.') + log.debug("Executing 404 handler.") unresolved_domain = request.unresolved_domain # We force all storage calls to use the external versions storage, # since we are serving an external version. @@ -420,6 +424,11 @@ def get(self, request, proxito_path): path=proxito_path, append_indexhtml=False, ) + + # Inject the UnresolvedURL into the HttpRequest so we can access from the middleware. + # We could resolve it again from the middleware, but we would duplicating DB queries. + request.unresolved_url = unresolved + project = unresolved.project version = unresolved.version filename = unresolved.filename @@ -708,11 +717,12 @@ def get(self, request): # Verify if the project is marked as spam and return a custom robots.txt if "readthedocsext.spamfighting" in settings.INSTALLED_APPS: from readthedocsext.spamfighting.utils import is_robotstxt_denied # noqa + if is_robotstxt_denied(project): return render( request, - 'robots.spam.txt', - content_type='text/plain', + "robots.spam.txt", + content_type="text/plain", ) # Use the ``robots.txt`` file from the default version configured @@ -747,33 +757,30 @@ def get(self, request): filename="robots.txt", check_if_exists=True, ) - log.info('Serving custom robots.txt file.') + log.info("Serving custom robots.txt file.") return response except StorageFileNotFound: pass # Serve default robots.txt - sitemap_url = '{scheme}://{domain}/sitemap.xml'.format( - scheme='https', + sitemap_url = "{scheme}://{domain}/sitemap.xml".format( + scheme="https", domain=project.subdomain(), ) context = { - 'sitemap_url': sitemap_url, - 'hidden_paths': self._get_hidden_paths(project), + "sitemap_url": sitemap_url, + "hidden_paths": self._get_hidden_paths(project), } return render( request, - 'robots.txt', + "robots.txt", context, - content_type='text/plain', + content_type="text/plain", ) def _get_hidden_paths(self, project): """Get the absolute paths of the public hidden versions of `project`.""" - hidden_versions = ( - Version.internal.public(project=project) - .filter(hidden=True) - ) + hidden_versions = Version.internal.public(project=project).filter(hidden=True) resolver = Resolver() hidden_paths = [ resolver.resolve_path(project, version_slug=version.slug) @@ -846,8 +853,8 @@ def hreflang_formatter(lang): Use hyphen instead of underscore in language and country value. ref: https://en.wikipedia.org/wiki/Hreflang#Common_Mistakes """ - if '_' in lang: - return lang.replace('_', '-') + if "_" in lang: + return lang.replace("_", "-") return lang def changefreqs_generator(): @@ -861,8 +868,8 @@ def changefreqs_generator(): aggressive. If the tag is removed and a branch is created with the same name, we will want bots to revisit this. """ - changefreqs = ['weekly', 'daily'] - yield from itertools.chain(changefreqs, itertools.repeat('monthly')) + changefreqs = ["weekly", "daily"] + yield from itertools.chain(changefreqs, itertools.repeat("monthly")) project = request.unresolved_domain.project public_versions = Version.internal.public( @@ -879,28 +886,34 @@ def changefreqs_generator(): # We want stable with priority=1 and changefreq='weekly' and # latest with priority=0.9 and changefreq='daily' # More details on this: https://github.com/rtfd/readthedocs.org/issues/5447 - if (len(sorted_versions) >= 2 and sorted_versions[0].slug == LATEST and - sorted_versions[1].slug == STABLE): - sorted_versions[0], sorted_versions[1] = sorted_versions[1], sorted_versions[0] + if ( + len(sorted_versions) >= 2 + and sorted_versions[0].slug == LATEST + and sorted_versions[1].slug == STABLE + ): + sorted_versions[0], sorted_versions[1] = ( + sorted_versions[1], + sorted_versions[0], + ) versions = [] for version, priority, changefreq in zip( - sorted_versions, - priorities_generator(), - changefreqs_generator(), + sorted_versions, + priorities_generator(), + changefreqs_generator(), ): element = { - 'loc': version.get_subdomain_url(), - 'priority': priority, - 'changefreq': changefreq, - 'languages': [], + "loc": version.get_subdomain_url(), + "priority": priority, + "changefreq": changefreq, + "languages": [], } # Version can be enabled, but not ``built`` yet. We want to show the # link without a ``lastmod`` attribute - last_build = version.builds.order_by('-date').first() + last_build = version.builds.order_by("-date").first() if last_build: - element['lastmod'] = last_build.date.isoformat() + element["lastmod"] = last_build.date.isoformat() resolver = Resolver() if project.translations.exists(): @@ -915,27 +928,31 @@ def changefreqs_generator(): project=translation, version=translated_version, ) - element['languages'].append({ - 'hreflang': hreflang_formatter(translation.language), - 'href': href, - }) + element["languages"].append( + { + "hreflang": hreflang_formatter(translation.language), + "href": href, + } + ) # Add itself also as protocol requires - element['languages'].append({ - 'hreflang': project.language, - 'href': element['loc'], - }) + element["languages"].append( + { + "hreflang": project.language, + "href": element["loc"], + } + ) versions.append(element) context = { - 'versions': versions, + "versions": versions, } return render( request, - 'sitemap.xml', + "sitemap.xml", context, - content_type='application/xml', + content_type="application/xml", ) def _get_project(self):