diff --git a/handbook/engineering/distribution/index.md b/handbook/engineering/distribution/index.md index 0f8e5f063c6..0796ec49e50 100644 --- a/handbook/engineering/distribution/index.md +++ b/handbook/engineering/distribution/index.md @@ -80,6 +80,7 @@ Go, Docker, Kubernetes * [Tools](./tools/index.md) * Tutorials * [Observability developer guide](observability/index.md) + * [Managed instances](managed/index.md) * [Collecting and inspecting metrics dumps](metrics_dumps.md) * [How to set up a separate website maintained by Sourcegraph](separate_website.md) * [How to simulate k8s admin security restrictions](k8s_admin_custom_policy.md) diff --git a/handbook/engineering/distribution/managed/architecture.excalidraw b/handbook/engineering/distribution/managed/architecture.excalidraw new file mode 100644 index 00000000000..736d33a5b5a --- /dev/null +++ b/handbook/engineering/distribution/managed/architecture.excalidraw @@ -0,0 +1,1534 @@ +{ + "type": "excalidraw", + "version": 2, + "source": "https://excalidraw.com", + "elements": [ + { + "type": "text", + "version": 754, + "versionNonce": 1662668634, + "isDeleted": false, + "id": "4uCkE4DKKh9L8nHpWcfwL", + "fillStyle": "hachure", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 557.1577735520525, + "y": 345.53849355449574, + "strokeColor": "#000000", + "backgroundColor": "transparent", + "width": 281, + "height": 24, + "seed": 2129494945, + "groupIds": [], + "fontSize": 20, + "fontFamily": 3, + "text": "Sourcegraph user browser", + "baseline": 19, + "textAlign": "left", + "verticalAlign": "top" + }, + { + "type": "rectangle", + "version": 874, + "versionNonce": 771255110, + "isDeleted": false, + "id": "6HSMg4e_wwymNPn-6Sjs-", + "fillStyle": "hachure", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 539.5436144930638, + "y": 330.5315149730354, + "strokeColor": "#000000", + "backgroundColor": 
"transparent", + "width": 318.4691011235955, + "height": 56.26755617977524, + "seed": 1839639745, + "groupIds": [] + }, + { + "type": "text", + "version": 1851, + "versionNonce": 216481313, + "isDeleted": false, + "id": "hX2YmWPcspBPlwjoHu-pH", + "fillStyle": "hachure", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1030.2344122296568, + "y": 331.4061411748569, + "strokeColor": "#000000", + "backgroundColor": "transparent", + "width": 352, + "height": 72, + "seed": 1458624449, + "groupIds": [], + "fontSize": 20, + "fontFamily": 3, + "text": "GCP Load Balancer\n- SSL termination\n- VPN-only firewall (optional)", + "baseline": 67, + "textAlign": "left", + "verticalAlign": "top" + }, + { + "type": "rectangle", + "version": 2201, + "versionNonce": 114028655, + "isDeleted": false, + "id": "N_hCRuaNVWsdUOVvCLnqr", + "fillStyle": "hachure", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1003.4724673773403, + "y": 315.7237603972742, + "strokeColor": "#000000", + "backgroundColor": "transparent", + "width": 399.1265800561798, + "height": 109.30916432584273, + "seed": 488005185, + "groupIds": [] + }, + { + "type": "text", + "version": 1191, + "versionNonce": 136491418, + "isDeleted": false, + "id": "6ZfuI6lBQuAKT3DKUdAQh", + "fillStyle": "hachure", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 542.1694266165409, + "y": 147.33746675823977, + "strokeColor": "#000000", + "backgroundColor": "transparent", + "width": 328, + "height": 24, + "seed": 107316961, + "groupIds": [], + "fontSize": 20, + "fontFamily": 3, + "text": "Sourcegraph DNS (Cloudflare)", + "baseline": 19, + "textAlign": "left", + "verticalAlign": "top" + }, + { + "type": "rectangle", + "version": 1866, + "versionNonce": 97406214, + "isDeleted": false, + "id": "RjKSZsFqhFjeKebMYxnhi", + "fillStyle": "hachure", + "strokeWidth": 1, + "strokeStyle": 
"solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 530.058475378788, + "y": 127.03077359770197, + "strokeColor": "#000000", + "backgroundColor": "transparent", + "width": 349.3134469696969, + "height": 61.69135952790912, + "seed": 1326391055, + "groupIds": [] + }, + { + "type": "rectangle", + "version": 1560, + "versionNonce": 478893082, + "isDeleted": false, + "id": "cRt7D7xA_4tU_yQ5uUF4a", + "fillStyle": "hachure", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 533.8478422619048, + "y": 281.70758928571433, + "strokeColor": "#000000", + "backgroundColor": "transparent", + "width": 332.6190476190477, + "height": 111.09002976190467, + "seed": 25482191, + "groupIds": [] + }, + { + "type": "text", + "version": 875, + "versionNonce": 1133974150, + "isDeleted": false, + "id": "n8jCoa9rZkkz2RQKBAzlA", + "fillStyle": "hachure", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 544.3305786115766, + "y": 293.45032391163863, + "strokeColor": "#000000", + "backgroundColor": "transparent", + "width": 281, + "height": 24, + "seed": 1369296225, + "groupIds": [], + "fontSize": 20, + "fontFamily": 3, + "text": "Corporate VPN (optional)", + "baseline": 19, + "textAlign": "left", + "verticalAlign": "top" + }, + { + "type": "rectangle", + "version": 2232, + "versionNonce": 2040194266, + "isDeleted": false, + "id": "nK54ejM4UVWnqL1VjYl7-", + "fillStyle": "hachure", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1507.6753891453604, + "y": 204.57737287730185, + "strokeColor": "#000000", + "backgroundColor": "transparent", + "width": 526.3441421046396, + "height": 224.82545915394698, + "seed": 1808608079, + "groupIds": [] + }, + { + "type": "text", + "version": 1725, + "versionNonce": 1117888815, + "isDeleted": false, + "id": "9dj63SIXdol74rSL0RCpv", + "fillStyle": "hachure", + "strokeWidth": 1, + 
"strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1538.498183238939, + "y": 228.96814052126217, + "strokeColor": "#000000", + "backgroundColor": "transparent", + "width": 176, + "height": 24, + "seed": 1026199663, + "groupIds": [], + "fontSize": 20, + "fontFamily": 3, + "text": "GCP VM (single)", + "baseline": 19, + "textAlign": "left", + "verticalAlign": "top" + }, + { + "type": "text", + "version": 2034, + "versionNonce": 1687295814, + "isDeleted": false, + "id": "EllUI8csZnR2UeG84D29h", + "fillStyle": "hachure", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1536.477082217129, + "y": 280.1521404938866, + "strokeColor": "#000000", + "backgroundColor": "transparent", + "width": 234, + "height": 120, + "seed": 262718049, + "groupIds": [], + "fontSize": 20, + "fontFamily": 3, + "text": "GCP disk (encrypted)\n\n- Configuration\n- User data\n- Company code", + "baseline": 115, + "textAlign": "left", + "verticalAlign": "top" + }, + { + "type": "rectangle", + "version": 630, + "versionNonce": 1057776289, + "isDeleted": false, + "id": "UUO_aO52htoib_8iAdYBb", + "fillStyle": "hachure", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 969.2454540134518, + "y": 177.6713243743617, + "strokeColor": "#000000", + "backgroundColor": "transparent", + "width": 1090.7914778047302, + "height": 554.3260273205536, + "seed": 1618172545, + "groupIds": [] + }, + { + "type": "text", + "version": 1176, + "versionNonce": 1176208641, + "isDeleted": false, + "id": "YVl1se-uT8D77qTiTsKgN", + "fillStyle": "hachure", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 993.58861001575, + "y": 197.8145113989034, + "strokeColor": "#000000", + "backgroundColor": "transparent", + "width": 410, + "height": 96, + "seed": 25695599, + "groupIds": [], + "fontSize": 20, + "fontFamily": 3, + "text": "Private Google 
Cloud infrastructure\n- Isolated VPC\n- Strict ACLs\n- Your deployment only", + "baseline": 91, + "textAlign": "left", + "verticalAlign": "top" + }, + { + "type": "text", + "version": 539, + "versionNonce": 469271302, + "isDeleted": false, + "id": "1pbNy_sXn2vowzfYL7n1X", + "fillStyle": "hachure", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1412.5048562122279, + "y": 341.83544073991266, + "strokeColor": "#000000", + "backgroundColor": "transparent", + "width": 47, + "height": 24, + "seed": 2056443823, + "groupIds": [], + "fontSize": 20, + "fontFamily": 3, + "text": "HTTP", + "baseline": 19, + "textAlign": "left", + "verticalAlign": "top" + }, + { + "type": "text", + "version": 643, + "versionNonce": 1888071567, + "isDeleted": false, + "id": "frj8w-9fac5aD7bE4TbPg", + "fillStyle": "hachure", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1576.0421571098243, + "y": -39.05281467425684, + "strokeColor": "#000000", + "backgroundColor": "transparent", + "width": 117, + "height": 24, + "seed": 1863978127, + "groupIds": [], + "fontSize": 20, + "fontFamily": 3, + "text": "GitHub.com", + "baseline": 19, + "textAlign": "left", + "verticalAlign": "top" + }, + { + "type": "text", + "version": 412, + "versionNonce": 830200865, + "isDeleted": false, + "id": "C59mLo0Er7zTq4VYO4GHW", + "fillStyle": "hachure", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1803.2817750336037, + "y": 12.3672190020161, + "strokeColor": "#000000", + "backgroundColor": "transparent", + "width": 199, + "height": 24, + "seed": 398686415, + "groupIds": [], + "fontSize": 20, + "fontFamily": 3, + "text": "GitHub Enterprise", + "baseline": 19, + "textAlign": "left", + "verticalAlign": "top" + }, + { + "type": "text", + "version": 1016, + "versionNonce": 1346890849, + "isDeleted": false, + "id": "IkjIQk85VrJpM4IP0xAFu", + "fillStyle": 
"hachure", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1770.79249807282, + "y": -102.51295103474911, + "strokeColor": "#000000", + "backgroundColor": "transparent", + "width": 211, + "height": 24, + "seed": 601354369, + "groupIds": [], + "fontSize": 20, + "fontFamily": 3, + "text": "Corporate Firewall", + "baseline": 19, + "textAlign": "left", + "verticalAlign": "top" + }, + { + "type": "rectangle", + "version": 780, + "versionNonce": 885895169, + "isDeleted": false, + "id": "MeG3x9eL7eECOc6ric2Yk", + "fillStyle": "hachure", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1757.9120710784339, + "y": -117.14430147058829, + "strokeColor": "#000000", + "backgroundColor": "transparent", + "width": 296.3088235294143, + "height": 226.33731617647092, + "seed": 1198897889, + "groupIds": [] + }, + { + "type": "rectangle", + "version": 419, + "versionNonce": 203820175, + "isDeleted": false, + "id": "UKh9MX57EgWLCg0qp_iDX", + "fillStyle": "hachure", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1786.099882392474, + "y": -5.504284274193537, + "strokeColor": "#000000", + "backgroundColor": "transparent", + "width": 244.03981854838707, + "height": 57.72119603889943, + "seed": 848882095, + "groupIds": [] + }, + { + "type": "rectangle", + "version": 748, + "versionNonce": 2044073185, + "isDeleted": false, + "id": "28gjkZ0bj-uAvAAMi2pbR", + "fillStyle": "hachure", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1540.5356083965855, + "y": -115.08754842662864, + "strokeColor": "#000000", + "backgroundColor": "transparent", + "width": 195.17463235294102, + "height": 162.13210092504738, + "seed": 1994172097, + "groupIds": [] + }, + { + "type": "text", + "version": 659, + "versionNonce": 1173022127, + "isDeleted": false, + "id": "_zjjpnyv3-c-4o6WTt_Ze", + 
"fillStyle": "hachure", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1555.7903077561716, + "y": -95.50562836021498, + "strokeColor": "#000000", + "backgroundColor": "transparent", + "width": 141, + "height": 24, + "seed": 323262753, + "groupIds": [], + "fontSize": 20, + "fontFamily": 3, + "text": "Access token", + "baseline": 19, + "textAlign": "left", + "verticalAlign": "top" + }, + { + "type": "rectangle", + "version": 684, + "versionNonce": 249745601, + "isDeleted": false, + "id": "GiJRoT7jCynEtAsuu6wdt", + "fillStyle": "hachure", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1549.6943044354896, + "y": -57.535366263440835, + "strokeColor": "#000000", + "backgroundColor": "transparent", + "width": 176.10887096774206, + "height": 60.94021287950662, + "seed": 902696801, + "groupIds": [] + }, + { + "type": "rectangle", + "version": 904, + "versionNonce": 1753491782, + "isDeleted": false, + "id": "ZcgE6aORC63qaVzNN631u", + "fillStyle": "hachure", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1522.85546875, + "y": 270.662353515624, + "strokeColor": "#000000", + "backgroundColor": "transparent", + "width": 257.88537720959664, + "height": 139.53247070312509, + "seed": 1585605089, + "groupIds": [] + }, + { + "type": "rectangle", + "version": 710, + "versionNonce": 1806056737, + "isDeleted": false, + "id": "pomUcW5La6SLOvZ7EDV_u", + "fillStyle": "hachure", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1797.1448488157826, + "y": 272.4105011470723, + "strokeColor": "#000000", + "backgroundColor": "transparent", + "width": 220.13608870967738, + "height": 68.4160786290323, + "seed": 249147823, + "groupIds": [] + }, + { + "type": "text", + "version": 616, + "versionNonce": 918577519, + "isDeleted": false, + "id": "uWdfA6g558ZIKsK0r5aIQ", + "fillStyle": 
"hachure", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1821.3991330899767, + "y": 282.6216906632011, + "strokeColor": "#000000", + "backgroundColor": "transparent", + "width": 176, + "height": 48, + "seed": 1102444321, + "groupIds": [], + "fontSize": 20, + "fontFamily": 3, + "text": " Sourcegraph\nDocker services", + "baseline": 43, + "textAlign": "left", + "verticalAlign": "top" + }, + { + "type": "arrow", + "version": 1708, + "versionNonce": 952774, + "isDeleted": false, + "id": "rn7z6ixRf9k2wzfSrxt96", + "fillStyle": "hachure", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 839.2775589968643, + "y": 329.49711976976334, + "strokeColor": "#000000", + "backgroundColor": "transparent", + "width": 0.004091438032787664, + "height": 136.1725237115247, + "seed": 923397921, + "groupIds": [], + "points": [ + [ + 0, + 0 + ], + [ + 0.004091438032787664, + -136.1725237115247 + ] + ] + }, + { + "type": "arrow", + "version": 1132, + "versionNonce": 1474895386, + "isDeleted": false, + "id": "mjZTkndcMCcBw8c_IAyXY", + "fillStyle": "hachure", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 860.8634366122159, + "y": 366.3525834517046, + "strokeColor": "#000000", + "backgroundColor": "transparent", + "width": 139.161288174716, + "height": 2.343757398200694, + "seed": 1480162561, + "groupIds": [], + "points": [ + [ + 0, + 0 + ], + [ + 139.161288174716, + 2.343757398200694 + ] + ] + }, + { + "type": "rectangle", + "version": 838, + "versionNonce": 981075937, + "isDeleted": false, + "id": "YkSkpchlAmTr0d4hdNe7-", + "fillStyle": "hachure", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1771.2955062065196, + "y": -65.84362399193517, + "strokeColor": "#000000", + "backgroundColor": "transparent", + "width": 273.31233693073773, + "height": 165.1009769331117, + 
"seed": 587557807, + "groupIds": [] + }, + { + "type": "text", + "version": 804, + "versionNonce": 1207149231, + "isDeleted": false, + "id": "ij95LukhbY8wFb2TAyNWg", + "fillStyle": "hachure", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1786.9901367805207, + "y": -47.98964510199258, + "strokeColor": "#000000", + "backgroundColor": "transparent", + "width": 141, + "height": 24, + "seed": 186362561, + "groupIds": [], + "fontSize": 20, + "fontFamily": 3, + "text": "Access token", + "baseline": 19, + "textAlign": "left", + "verticalAlign": "top" + }, + { + "type": "text", + "version": 542, + "versionNonce": 1270397318, + "isDeleted": false, + "id": "TonXctYmYAz1n-uGX3OT7", + "fillStyle": "hachure", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0.016348048670598914, + "x": 887.8579940025245, + "y": 316.07031249999955, + "strokeColor": "#000000", + "backgroundColor": "transparent", + "width": 59, + "height": 48, + "seed": 1574378543, + "groupIds": [], + "fontSize": 20, + "fontFamily": 3, + "text": "HTTPS\n+SSO", + "baseline": 43, + "textAlign": "left", + "verticalAlign": "top" + }, + { + "type": "text", + "version": 412, + "versionNonce": 1638958854, + "isDeleted": false, + "id": "XZ1x4ZD8Vi3nj2DbdEVZT", + "fillStyle": "hachure", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1768.3911037692806, + "y": 490.10388553503844, + "strokeColor": "#000000", + "backgroundColor": "transparent", + "width": 246, + "height": 24, + "seed": 283725185, + "groupIds": [], + "fontSize": 20, + "fontFamily": 3, + "text": "Automatic GCP backups", + "baseline": 19, + "textAlign": "left", + "verticalAlign": "top" + }, + { + "type": "rectangle", + "version": 593, + "versionNonce": 1062754383, + "isDeleted": false, + "id": "Q50D9v1og-cCpwbAUmZwn", + "fillStyle": "hachure", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + 
"opacity": 100, + "angle": 0, + "x": 1752.2597212357987, + "y": 469.0696244673294, + "strokeColor": "#000000", + "backgroundColor": "transparent", + "width": 281.4660295758928, + "height": 66.729736328125, + "seed": 1057865633, + "groupIds": [] + }, + { + "type": "arrow", + "version": 190, + "versionNonce": 1645724314, + "isDeleted": false, + "id": "-3_4Jue8vzg8SzC_W_Guq", + "fillStyle": "hachure", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1402.8130918560605, + "y": 371.6285511363636, + "strokeColor": "#000000", + "backgroundColor": "transparent", + "width": 93.40660511363649, + "height": 1.768821022727252, + "seed": 215251823, + "groupIds": [], + "points": [ + [ + 0, + 0 + ], + [ + 93.40660511363649, + 1.768821022727252 + ] + ] + }, + { + "type": "line", + "version": 61, + "versionNonce": 91592591, + "isDeleted": false, + "id": "KdVoxbUT8RMX5KdqAIwcM", + "fillStyle": "hachure", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1780.324337121212, + "y": 409.08522727272725, + "strokeColor": "#000000", + "backgroundColor": "transparent", + "width": 117.7260890151515, + "height": 59.144176136363626, + "seed": 579716161, + "groupIds": [], + "points": [ + [ + 0, + 0 + ], + [ + 117.7260890151515, + 59.144176136363626 + ] + ] + }, + { + "type": "arrow", + "version": 566, + "versionNonce": 1396936538, + "isDeleted": false, + "id": "YP7egCVuhk5FSEkjS7ijo", + "fillStyle": "hachure", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1771.3164482526881, + "y": 202.81779233870975, + "strokeColor": "#000000", + "backgroundColor": "transparent", + "width": 132.64444724462373, + "height": 189.9849210349463, + "seed": 693077391, + "groupIds": [], + "points": [ + [ + 0, + 0 + ], + [ + -132.64444724462373, + -189.9849210349463 + ] + ] + }, + { + "type": "arrow", + "version": 433, + "versionNonce": 1267256838, + 
"isDeleted": false, + "id": "fTXIxFWfRCw389dJFgOzw", + "fillStyle": "hachure", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1772.3405577956992, + "y": 203.64957157258073, + "strokeColor": "#000000", + "backgroundColor": "transparent", + "width": 133.68136760752668, + "height": 145.14957157258073, + "seed": 1991020303, + "groupIds": [], + "points": [ + [ + 0, + 0 + ], + [ + 133.68136760752668, + -145.14957157258073 + ] + ] + }, + { + "type": "rectangle", + "version": 405, + "versionNonce": 1644189647, + "isDeleted": false, + "id": "JRYXaAiAomiP_v9PKoqF2", + "fillStyle": "hachure", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1262.0839213709698, + "y": -113.38029233870967, + "strokeColor": "#000000", + "backgroundColor": "transparent", + "width": 254.42918346773942, + "height": 123.50050403225802, + "seed": 1717964417, + "groupIds": [] + }, + { + "type": "text", + "version": 188, + "versionNonce": 2125271201, + "isDeleted": false, + "id": "U0aVTtr77_t6fjA9t3Ffp", + "fillStyle": "hachure", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1279.3956653225807, + "y": -99.47253024193549, + "strokeColor": "#000000", + "backgroundColor": "transparent", + "width": 223, + "height": 96, + "seed": 1298440353, + "groupIds": [], + "fontSize": 20, + "fontFamily": 3, + "text": "Auth / SSO provider\n- Okta\n- GitHub\n- etc.", + "baseline": 91, + "textAlign": "left", + "verticalAlign": "top" + }, + { + "type": "arrow", + "version": 428, + "versionNonce": 2116964550, + "isDeleted": false, + "id": "FZ6nygoAnRw6bArw3tr-s", + "fillStyle": "hachure", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1771.797379032258, + "y": 203.7939768145161, + "strokeColor": "#000000", + "backgroundColor": "transparent", + "width": 316.265877016129, + "height": 183.27809979838707, + 
"seed": 550210607, + "groupIds": [], + "points": [ + [ + 0, + 0 + ], + [ + -316.265877016129, + -183.27809979838707 + ] + ] + }, + { + "type": "text", + "version": 706, + "versionNonce": 81837722, + "isDeleted": false, + "id": "RyJXNkZNIblLn01rJ6SA7", + "fillStyle": "hachure", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 554.4614009533941, + "y": 554.1326138771192, + "strokeColor": "#000000", + "backgroundColor": "transparent", + "width": 258, + "height": 24, + "seed": 788968399, + "groupIds": [], + "fontSize": 20, + "fontFamily": 3, + "text": "Sourcegraph admin team", + "baseline": 19, + "textAlign": "left", + "verticalAlign": "top" + }, + { + "type": "text", + "version": 136, + "versionNonce": 731163418, + "isDeleted": false, + "id": "fA_g_BJQ21aMKOpehSRWH", + "fillStyle": "hachure", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1021.096663135595, + "y": 552.0721001059321, + "strokeColor": "#000000", + "backgroundColor": "transparent", + "width": 258, + "height": 24, + "seed": 2109791887, + "groupIds": [], + "fontSize": 20, + "fontFamily": 3, + "text": "GCP IAP TCP Forwarding", + "baseline": 19, + "textAlign": "left", + "verticalAlign": "top" + }, + { + "type": "rectangle", + "version": 200, + "versionNonce": 95595398, + "isDeleted": false, + "id": "xji9NKvDhZrjpFIlJltwP", + "fillStyle": "hachure", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1004.7311970338983, + "y": 532.8949284957628, + "strokeColor": "#000000", + "backgroundColor": "transparent", + "width": 295.0013241525424, + "height": 60.758739406779284, + "seed": 1695983009, + "groupIds": [] + }, + { + "type": "rectangle", + "version": 688, + "versionNonce": 1166249990, + "isDeleted": false, + "id": "Z9fFNuLt9u_C8ChabeeVd", + "fillStyle": "hachure", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + 
"angle": 0, + "x": 542.2868776483049, + "y": 540.8413003177966, + "strokeColor": "#000000", + "backgroundColor": "transparent", + "width": 284.9443855932231, + "height": 51.27118644067806, + "seed": 416885327, + "groupIds": [] + }, + { + "type": "text", + "version": 189, + "versionNonce": 1133780166, + "isDeleted": false, + "id": "sJJXL6V4fgxMXkYRdRfXn", + "fillStyle": "hachure", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 846.5839512711859, + "y": 536.034759004237, + "strokeColor": "#000000", + "backgroundColor": "transparent", + "width": 105, + "height": 24, + "seed": 1908241903, + "groupIds": [], + "fontSize": 20, + "fontFamily": 3, + "text": "SSH/HTTPS", + "baseline": 19, + "textAlign": "left", + "verticalAlign": "top" + }, + { + "type": "arrow", + "version": 463, + "versionNonce": 1107213574, + "isDeleted": false, + "id": "8meGAU-DtfmGhEXbdCNQ1", + "fillStyle": "hachure", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 825.4974179025423, + "y": 564.1034825211866, + "strokeColor": "#000000", + "backgroundColor": "transparent", + "width": 175.12334480932213, + "height": 0.13949947033916033, + "seed": 1569270255, + "groupIds": [], + "points": [ + [ + 0, + 0 + ], + [ + 175.12334480932213, + -0.13949947033916033 + ] + ] + }, + { + "type": "arrow", + "version": 289, + "versionNonce": 568249089, + "isDeleted": false, + "id": "-397p3FPY7vmDqAhTg4Pa", + "fillStyle": "hachure", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1293.680879237288, + "y": 564.7039194915254, + "strokeColor": "#000000", + "backgroundColor": "transparent", + "width": 220.46477754237276, + "height": 121.35195974576271, + "seed": 939468111, + "groupIds": [], + "points": [ + [ + 0, + 0 + ], + [ + 220.46477754237276, + -121.35195974576271 + ] + ] + }, + { + "type": "text", + "version": 313, + "versionNonce": 551413839, + 
"isDeleted": false, + "id": "cSGVwHquz5zegCc3a_O7z", + "fillStyle": "hachure", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 5.817199953355827, + "x": 1370.3215042372876, + "y": 478.43882415254234, + "strokeColor": "#000000", + "backgroundColor": "transparent", + "width": 35, + "height": 24, + "seed": 1644124321, + "groupIds": [], + "fontSize": 20, + "fontFamily": 3, + "text": "SSH", + "baseline": 19, + "textAlign": "left", + "verticalAlign": "top" + }, + { + "type": "text", + "version": 339, + "versionNonce": 914983878, + "isDeleted": false, + "id": "i_DZWT6FjOENkTehHLqij", + "fillStyle": "hachure", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 555.7269597457625, + "y": 645.4882150423732, + "strokeColor": "#000000", + "backgroundColor": "transparent", + "width": 305, + "height": 72, + "seed": 809944591, + "groupIds": [], + "fontSize": 20, + "fontFamily": 3, + "text": "Sourcegraph infrastructure\n\nAutomation / Terraform", + "baseline": 67, + "textAlign": "left", + "verticalAlign": "top" + }, + { + "type": "rectangle", + "version": 321, + "versionNonce": 2038070170, + "isDeleted": false, + "id": "e1WMWeda-EKnTy8xX_fBt", + "fillStyle": "hachure", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 542.2643008474574, + "y": 632.9381620762715, + "strokeColor": "#000000", + "backgroundColor": "transparent", + "width": 328.09189618644075, + "height": 98.589777542373, + "seed": 1478595759, + "groupIds": [] + }, + { + "type": "text", + "version": 173, + "versionNonce": 1579415750, + "isDeleted": false, + "id": "Q8DzS39tf6bla4zynya7c", + "fillStyle": "hachure", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1019.7624470338965, + "y": 639.792439088983, + "strokeColor": "#000000", + "backgroundColor": "transparent", + "width": 293, + "height": 48, + "seed": 
2036119631, + "groupIds": [], + "fontSize": 20, + "fontFamily": 3, + "text": "GCP Compute Engine API\nCloud IAM service account", + "baseline": 43, + "textAlign": "left", + "verticalAlign": "top" + }, + { + "type": "rectangle", + "version": 215, + "versionNonce": 13196954, + "isDeleted": false, + "id": "Nwt2jfc1fgQMRs4CNKIMS", + "fillStyle": "hachure", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1005.9870233050838, + "y": 621.4725238347457, + "strokeColor": "#000000", + "backgroundColor": "transparent", + "width": 320.9017478813561, + "height": 85.48728813559259, + "seed": 234611407, + "groupIds": [] + }, + { + "type": "arrow", + "version": 319, + "versionNonce": 1393865370, + "isDeleted": false, + "id": "mD54qVysvwT72l3KmEd1A", + "fillStyle": "hachure", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 874.1726694915254, + "y": 665.7701933262712, + "strokeColor": "#000000", + "backgroundColor": "transparent", + "width": 128.16810116525414, + "height": 1.5532971398305335, + "seed": 855920527, + "groupIds": [], + "points": [ + [ + 0, + 0 + ], + [ + 128.16810116525414, + 1.5532971398305335 + ] + ] + }, + { + "type": "arrow", + "version": 194, + "versionNonce": 1487750945, + "isDeleted": false, + "id": "vcJ7z2-OLn4mb-TRG9wrK", + "fillStyle": "hachure", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 1322.203125, + "y": 666.2995233050848, + "strokeColor": "#000000", + "backgroundColor": "transparent", + "width": 215.41975635593212, + "height": 215.4859639830509, + "seed": 157345441, + "groupIds": [], + "points": [ + [ + 0, + 0 + ], + [ + 215.41975635593212, + -215.4859639830509 + ] + ] + }, + { + "type": "text", + "version": 186, + "versionNonce": 1786921377, + "isDeleted": false, + "id": "JcqryEAoUnRIpHY2Lu645", + "fillStyle": "hachure", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + 
"opacity": 100, + "angle": 5.487010575058042, + "x": 1378.8723516949142, + "y": 546.950476694915, + "strokeColor": "#000000", + "backgroundColor": "transparent", + "width": 59, + "height": 24, + "seed": 1615498767, + "groupIds": [], + "fontSize": 20, + "fontFamily": 3, + "text": "HTTPS", + "baseline": 19, + "textAlign": "left", + "verticalAlign": "top" + }, + { + "type": "text", + "version": 81, + "versionNonce": 1179010310, + "isDeleted": false, + "id": "Is56D2S3fP6_qgdiFoUVO", + "fillStyle": "hachure", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 890.750993114407, + "y": 640.1981594279664, + "strokeColor": "#000000", + "backgroundColor": "transparent", + "width": 59, + "height": 24, + "seed": 840748495, + "groupIds": [], + "fontSize": 20, + "fontFamily": 3, + "text": "HTTPS", + "baseline": 19, + "textAlign": "left", + "verticalAlign": "top" + }, + { + "type": "text", + "version": 920, + "versionNonce": 1894612358, + "isDeleted": false, + "id": "H-r6Kx5ac7fE1nNYNLvvn", + "fillStyle": "hachure", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0.9580467719359387, + "x": 1692.4965950363203, + "y": 69.65112363801455, + "strokeColor": "#000000", + "backgroundColor": "transparent", + "width": 59, + "height": 48, + "seed": 411023457, + "groupIds": [], + "fontSize": 20, + "fontFamily": 3, + "text": "HTTPS\nGit", + "baseline": 43, + "textAlign": "left", + "verticalAlign": "top" + }, + { + "type": "text", + "version": 957, + "versionNonce": 306723098, + "isDeleted": false, + "id": "B6OfyCjcoOFu8hRSYDOV-", + "fillStyle": "hachure", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0.5817071822522202, + "x": 1590.2247276029057, + "y": 83.7688029661017, + "strokeColor": "#000000", + "backgroundColor": "transparent", + "width": 59, + "height": 24, + "seed": 635233839, + "groupIds": [], + "fontSize": 20, + "fontFamily": 3, + "text": 
"HTTPS", + "baseline": 19, + "textAlign": "left", + "verticalAlign": "top" + }, + { + "type": "text", + "version": 1370, + "versionNonce": 1304294022, + "isDeleted": false, + "id": "pLTZ14gbGb9PDkzlR7oqQ", + "fillStyle": "hachure", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 5.44253378531101, + "x": 1831.7279528601728, + "y": 121.1619911470944, + "strokeColor": "#000000", + "backgroundColor": "transparent", + "width": 59, + "height": 48, + "seed": 1312710735, + "groupIds": [], + "fontSize": 20, + "fontFamily": 3, + "text": "HTTPS\nGit", + "baseline": 43, + "textAlign": "left", + "verticalAlign": "top" + }, + { + "type": "text", + "version": 478, + "versionNonce": 70217734, + "isDeleted": false, + "id": "WxA_NVDjuEJwXWT0b2qgd", + "fillStyle": "hachure", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": 533.3430311743339, + "y": -106.01138771186433, + "strokeColor": "#000000", + "backgroundColor": "transparent", + "width": 612, + "height": 86, + "seed": 1703507829, + "groupIds": [], + "fontSize": 36, + "fontFamily": 3, + "text": "Sourcegraph\nmanaged instance architecture", + "baseline": 78, + "textAlign": "left", + "verticalAlign": "top" + } + ], + "appState": { + "viewBackgroundColor": "#ffffff", + "gridSize": null + } +} \ No newline at end of file diff --git a/handbook/engineering/distribution/managed/cost_estimation.md b/handbook/engineering/distribution/managed/cost_estimation.md new file mode 100644 index 00000000000..011a98d7b32 --- /dev/null +++ b/handbook/engineering/distribution/managed/cost_estimation.md @@ -0,0 +1,38 @@ +# Managed instances: cost estimation + +The following describes how to **estimate** the infrastructure costs incurred for a managed instance. Managed instances are always created in completely isolated GCP projects, and as such it is easy to see the exact infrastructure cost breakdown for a given customer. 
+ +| Cost estimate | Description | +|--------------------------------|------------------------------------------------------------------------------------------------------------------------------| +| $388/mo min. varies | [VM instance](https://cloud.google.com/compute/vm-instance-pricing#n1_standard_machine_types), `n1-standard-16` min. in `us-central`. Consult [resource estimator](https://docs.sourcegraph.com/admin/install/resource_estimator) | +| $85/mo min. / $0.340/GB | Data disk, SSD/regional, 250G minimum - exact size depends on customer's repository sizes. | +| $34/mo | Boot disk, SSD/regional, 100G fixed size. | +| $85/mo / $0.026/GB | 13 weekly snapshots (taken weekly, retained for 90d) | +| $78/mo / $0.026/GB | 12 monthly upgrade data disk snapshots (taken once per month as part of upgrade, retained for 1yr). | +| $36/mo | [Cloud Load Balancing](https://cloud.google.com/vpc/network-pricing#lb), two rules required. | +| $12/mo | [External IP address](https://cloud.google.com/vpc/network-pricing#ipaddress), four required. | +| $1/mo + $0.045/GB | [Cloud NAT](https://cloud.google.com/vpc/network-pricing#nat-pricing), one required. | +| $6/mo + $0.75/million requests | [Cloud Armor](https://cloud.google.com/vpc/network-pricing#armor-pricing), one policy, one rule, and user-activity requests. | +| $1.42/GB | [Cloud Proxy](https://cloud.google.com/vpc/network-pricing#proxy-instance-charge), 3 are required. | +| $0.12/GB | [Internet egress traffic](https://cloud.google.com/vpc/network-pricing#internet_egress) from `us-central1`. | + +Or, more succinctly: + +| Cost estimate | Description | +|----------------|--------------------------------------------------------------------------| +| $282/mo | **data and snapshot storage costs for the first 250GB** | +| $56/mo | **network infrastructure costs** | +| +$388/mo min. 
| **n1-standard-16, or your VM instance type based on resource estimator** | +| +$1/GB storage | **For any additional storage above 250GB required.** | +| +$1.54/GB | **For each GB of network traffic to/from the instance.** | + +Thus, the smallest managed instance supporting ~300 repositories and ~100 users costs around $726/mo. + +**IMPORTANT:** This is a _rough estimate_ to the best of our ability; infrastructure pricing estimates are not trivial to do extremely accurately and actual costs are always subject to change. The goal of this is to give you a rough estimate of costs, not an exact amount. + +### Cost savings + +The above does not take into account: + +- Potential committed use discounts, which are handled transparently on Sourcegraph's side through [cross-project committed use discount sharing](https://cloud.google.com/compute/docs/instances/signing-up-committed-use-discounts#sharing_committed_use_discounts_across_projects) and passed on to you automatically. +- Cost reduction based on proactive monitoring of services based on your actual usage - resource requirements of Sourcegraph vary widely depending on how your usage looks. This is handled automatically by Sourcegraph as part of monthly upgrades. diff --git a/handbook/engineering/distribution/managed/creation_process.md b/handbook/engineering/distribution/managed/creation_process.md new file mode 100644 index 00000000000..88c330ded3e --- /dev/null +++ b/handbook/engineering/distribution/managed/creation_process.md @@ -0,0 +1,63 @@ +# Creating a managed instance + +Creating a new managed instance involves following the steps below. + +1. Ask @stephen or @beyang to create a new GCP project `sourcegraph-managed-$COMPANY` and grant you IAM **Editor** role access. +1. Ask @beyang to enable billing in the GCP project. +1. 
Create GCP service account credentials: + - From console.cloud.google.com select the project > **APIs & Services** > **Credentials** > **Create credentials** > **Service account** + - Service account name: `deploy` + - Service account description: blank + - On the **Service account permissions (optional)** page add the **Compute Admin**, **Storage admin**. + - **Done** > ignore **Grant users access to this service account (optional)** and choose **Done** + - Select **Edit** (pencil) on the service account we just created + - **Add key** > **Create new key** > **JSON** > **Create** +1. Upload the service account key and create admin credentials in 1password: + - Open the 1password [Managed instances vault](https://my.1password.com/vaults/l35e5xtcfsk5suuj4vfj76hqpy/allitems) (ask @stephen, @gonza, or @beyang to grant you access) + - **Add** > **Document** > enter **$COMPANY service account** as the title > Upload the service account JSON file previously downloaded > **Save** + - **Add** > **Password** > enter **$COMPANY sourcegraph-admin** as the title > Change **length** to 40 and turn on symbols and digits >> **Save** +1. In GCP, enable the **Compute Engine API**: + - Under **APIs & Services** > **Library** search for "Compute" + - Select **Compute Engine API** and choose **Enable** +1. `export GOOGLE_APPLICATION_CREDENTIALS=~/Downloads/sourcegraph-managed-company-220df65550d4.json` +1. Clone and `cd deploy-sourcegraph-managed/` +1. `VERSION=v3.17.2 ./create-deployment.sh $COMPANY/` and **commit the result.** +1. Open and edit `deploy-sourcegraph-managed/$COMPANY/gcp-tfstate/gcp-tfstate.tf` according to the comments within, commit the result. +1. In `gcp-tfstate` run `terraform init && terraform apply && git add . && git commit -m 'initialize GCP tfstate bucket'` +1. Open and edit `infrastructure.tf` according to the comments within and commit the result. +1. In `deploy-sourcegraph-managed/$COMPANY` run `terraform init && terraform plan && terraform apply` +1. 
Access the instance over SSH and confirm all containers are healthy (instructions below). +1. In the infrastructure repository, [create a DNS entry](https://github.com/sourcegraph/infrastructure/blob/master/dns/sourcegraph-managed.tf) that points `$COMPANY.sourcegraph.com` to the `default-global-address` IP (see "Finding the external load balancer IP" below) and follow the process there to `asdf exec terraform apply` it. +1. Confirm all containers come up healthy (`docker ps` should report them as such) +1. Create a PR for review. +1. Access the Sourcegraph web UI (instructions for port-forwarding below) +1. Navigate to Grafana and confirm the instance looks healthy. +1. Set up the initial admin account (for use by the Sourcegraph team only) + - Email: `managed+$COMPANY@sourcegraph.com` (note `+` sign not `-`) + - Username: `sourcegraph-admin` + - Password: Use the password previously created and stored in 1password. +1. Configure `externalURL` in the site configuration, and use SSH to restart the server (`sudo su`, `shutdown -r`) wait for it to come back up and access it again. +1. In the global user settings, set `"alerts.showPatchUpdates": false` +1. In the GCP web UI under **Network services** > **Load balancers** > select the load balancer > watch the SSL certificate status. It may take some time for it to become active (~1h41m) / for Google to see the DNS change from Cloudflare. Confirm it is active by following ["Access through the GCP load balancer as a user would"](#access-through-the-gcp-load-balancer-as-a-user-would). +1. 
In the site configuration, configure alerting to go to our `#alerts-managed-instances` Slack channel, for example (replace `$COMPANY` with the actual company name, and `$WEBHOOK_URL` with the actual webhook URL from 1password **Managed instances** > `#alerts-managed-instances Slack webhook URL`): + +``` + "observability.alerts": [ + { + "level": "critical", + "notifier": { + "type": "slack", + "username": "$COMPANY", + "url": "$WEBHOOK_URL" + } + }, + { + "level": "warning", + "notifier": { + "type": "slack", + "username": "$COMPANY", + "url": "$WEBHOOK_URL" + } + } + ], +``` diff --git a/handbook/engineering/distribution/managed/index.md b/handbook/engineering/distribution/managed/index.md new file mode 100644 index 00000000000..6753a867a9c --- /dev/null +++ b/handbook/engineering/distribution/managed/index.md @@ -0,0 +1,118 @@ +# Managed instances + +This documentation details how the Distribution team at Sourcegraph internally handles the provisioning/creation/configuration/maintenance of [managed instances](https://docs.sourcegraph.com/admin/install/managed). + +Please first read [the customer-facing managed instance documentation](https://docs.sourcegraph.com/admin/install/managed) to understand what these are and what we provide. 
+ +- [Technical details](#technical-details) + - [Deployment type and scaling](#deployment-type-and-scaling) + - [Known limitations of managed instances](#known-limitations-of-managed-instances) + - [Security](#security) +- [Cost estimation](cost_estimation.md) +- [Creating a managed instance](creation_process.md) +- [Operations](operations.md) + - [Red/black deployment model](operations.md#red-black-deployment-model) + - [SSH access](operations.md#ssh-access) + - [Port-forwarding (direct access to Caddy, Jaeger, and Grafana)](operations.md#port-forwarding-direct-access-to-caddy-jaeger-and-grafana) + - [Access through the GCP load balancer as a user would](operations.md#access-through-the-gcp-load-balancer-as-a-user-would) + - [Accessing the Docker containers](operations.md#accessing-the-docker-containers) + - [Finding the external IPs](operations.md#finding-the-external-ips) + - [Impact of recreating the instance via Terraform](operations.md#impact-of-recreating-the-instance-via-terraform) + - [Instance is recreated when startup script changes](operations.md#instance-is-recreated-when-startup-script-changes) + - [Debugging startup scripts](operations.md#debugging-startup-scripts) +- [Upgrading a managed instance](upgrade_process.md) + - [1) Add a banner indicating scheduled maintenance is in progress](upgrade_process.md#1-add-a-banner-indicating-scheduled-maintenance-is-in-progress) + - [2) Mark the database as ready-only](upgrade_process.md#2-mark-the-database-as-ready-only) + - [3) Create a snapshot of the current deployment](upgrade_process.md#3-create-a-snapshot-of-the-current-deployment) + - [4) Initialize the new production deployment](upgrade_process.md#4-initialize-the-new-production-deployment) + - [5) Make the database on the new deployment writable](upgrade_process.md#5-make-the-database-on-the-new-deployment-writable) + - [6) Upgrade the new deployment](upgrade_process.md#6-upgrade-the-new-deployment) + - [7) Recreate the 
deployment](upgrade_process.md#7-recreate-the-deployment) + - [8) Confirm instance health](upgrade_process.md#8-confirm-instance-health) + - [9) Switch the load balancer target](upgrade_process.md#9-switch-the-load-balancer-target) + - [10) Remove the banner indicating scheduled maintenance is in progress](upgrade_process.md#10-remove-the-banner-indicating-scheduled-maintenance-is-in-progress) + - [11) Take down the old deployment](upgrade_process.md#11-take-down-the-old-deployment) +- [FAQ](#faq) + - [FAQ: Why did we choose Docker Compose over Kubernetes deployments?](#faq-why-did-we-choose-docker-compose-over-kubernetes-deployments) + +## Technical details + +![architecture](https://storage.googleapis.com/sourcegraph-assets/managed-instance-architecture.png) + +### Deployment type and scaling + +Managed instances are Docker Compose deployments only today. We do not currently offer Kubernetes managed instances. + +These managed Docker Compose deployments can scale up to the largest GCP instance type available, n1-standard-96 with 96 CPU / 360 GB memory, which is typically enough for most medium to large enterprises. + +We do not offer Kubernetes managed instances today as this introduces some complexity for us in terms of ongoing maintenance and overhead; we may revisit this decision in the future. + +### Known limitations of managed instances + +Sourcegraph managed instances are single-machine Docker-Compose deployments only. We do not offer Kubernetes managed instances, or multi-machine deployments, today. + +With that said, Docker Compose deployments can scale up to the largest GCP instance type available, n1-standard-96 with 96 CPU & 360 GB memory, and are typically capable of supporting all but the largest of enterprises (around 25,000 repositories and 3,000 users are supported, based on what we have seen thus far.) + +The main limitation of this model is that an underlying GCP infrastructure outage could result in downtime, i.e. it is not an HA deployment. 
+ +See also: [FAQ: Why did we choose Docker Compose over Kubernetes deployments?](#faq-why-did-we-choose-docker-compose-over-kubernetes-deployments) + +### Security + +- **Isolation**: Each managed instance is created in an isolated GCP project with heavy gcloud access ACLs and network ACLs for security reasons. +- **Admin access**: Both the customer and Sourcegraph personnel will have access to an application-level admin account. +- **VM/SSH access**: Only Sourcegraph personnel will have access to the actual GCP VM; this is done securely through GCP IAP TCP proxy access only. Sourcegraph personnel can make changes or provide data from the VM upon request by the customer. +- **Inbound network access**: The customer may choose between having the deployment be accessible via the public internet and protected by their SSO provider, or for additional security have the deployment restricted to an allowlist of IP addresses only (such as their corporate VPN, etc.) +- **Outbound network access**: The Sourcegraph deployment will have unfettered egress TCP/ICMP access, and customers will need to allow the Sourcegraph deployment to contact their code host. This can be done by having their code-host be publicly accessible, or by allowing the static IP of the Sourcegraph deployment to access their code host. + +### Configuration management + +Terraform is used to maintain all managed instances. You can find this configuration here: https://github.com/sourcegraph/deploy-sourcegraph-managed + +All customer credentials, secrets, site configuration, and app and user configuration are stored in Postgres only (i.e. on the encrypted GCP disk). This allows customers to enter their access tokens, secrets, etc. directly into the app through the web UI without transferring them to us elsewhere. + +## FAQ + +### FAQ: Can customers disable the "Builtin username-password authentication"? 
+ +No, this is required in order for Sourcegraph to access the instance and debug issues through the initial admin account. + +However, it does not need to be used by the customer or their users at all. The default login method can be configured to their SSO provider of choice. + +### FAQ: Why did we choose Docker Compose over Kubernetes deployments? + +Managed instances are an interim solution to having the Sourcegraph.com Cloud natively support multi-tenant private repositories. The thinking has been that: + +1. Managed instances are a good interim solution because Sourcegraph.com Cloud native support for private repositories is several quarters out, at least. +2. Managed instances are something we can choose not to invest in at all until we see customer traction, and we can tailor how much we invest based on customer traction. +3. Managed instances are something with little ongoing maintenance burden to us as long as we keep them simple, and automate as much as possible when we begin to see traction. + +Because of these reasons, and due to this being an interrupt to our regularly scheduled work, a few things with Kubernetes managed instances did not make sense at our current point in time: + +- We would need to set up multi-machine backup infrastructure and processes for restoration - substantially harder to do on a moment's notice. +- Managing network ACLs in a Kubernetes deployment was substantially harder to do: + - Need static NAT IPs for prospective customers to allow access to their code host which may be behind a corporate firewall. + - Need to integrate something like GCP load balancer for SSL termination _with_ IP allow-listing (to restrict access to customer's VPN only) +- Performing upgrades in Kubernetes deployments is substantially more time consuming: + - Being confident about addressing merge conflicts on upgrades and having them not result in unwanted changes has been difficult for us and customers. 
+ - We do not have proper automation in place for Kubernetes upgrades, and automating Docker Compose upgrades seemed much more straightforward. + +Additionally, we noted the following: + +- Docker-Compose deployments have suited companies up to 25k repos & ~3000 devs well in the past (and, frankly, beyond that.) +- We have a proper production Kubernetes deployment (sourcegraph.com) but lacked a proper Docker-Compose production deployment - this could act as that and ensure we do even more proper diligence with that deployment type and keep it functioning well for the many customers out there running it today. +- For the first customer requesting managed instances, we had a spin-up time of only ~3d. + +None of this is to say that we will not consider switching said managed instances to Kubernetes in the future under different circumstances - it is just to say we are not doing that today. + +### FAQ: "googleapi: Error 400: The network_endpoint_group resource ... is already being used" + +If `terraform apply` is giving you: + +``` +Error: Error when reading or editing NetworkEndpointGroup: googleapi: Error 400: The network_endpoint_group resource 'projects/sourcegraph-managed-$COMPANY/zones/us-central1-f/networkEndpointGroups/default-neg' is already being used by 'projects/sourcegraph-managed-$COMPANY/global/backendServices/default-backend-service', resourceInUseByAnotherResource +``` + +Or similar - this indicates a bug in Terraform where GCP requires an associated resource to be deleted first and Terraform is trying to delete (or create) that resource in the wrong order. + +To work around the issue, locate the resource in GCP yourself and delete it manually and then `terraform apply` again. 
diff --git a/handbook/engineering/distribution/managed/operations.md b/handbook/engineering/distribution/managed/operations.md new file mode 100644 index 00000000000..1a231b1d346 --- /dev/null +++ b/handbook/engineering/distribution/managed/operations.md @@ -0,0 +1,139 @@ +# Managed instances operations + +- [Red/black deployment model](#red-black-deployment-model) +- [SSH access](#ssh-access) +- [Port-forwarding (direct access to Caddy, Jaeger, and Grafana)](#port-forwarding-direct-access-to-caddy-jaeger-and-grafana) +- [Access through the GCP load balancer as a user would](#access-through-the-gcp-load-balancer-as-a-user-would) +- [Accessing the Docker containers](#accessing-the-docker-containers) +- [Finding the external IPs](#finding-the-external-ips) +- [Impact of recreating the instance via Terraform](#impact-of-recreating-the-instance-via-terraform) +- [Instance is recreated when startup script changes](#instance-is-recreated-when-startup-script-changes) +- [Debugging startup scripts](#debugging-startup-scripts) + +## Red/black deployment model + +At any point in time only one deployment is the active production instance; this is noted in `deploy-sourcegraph-managed/$COMPANY/terraform.tfvars`, and except during upgrades only one is running. You can see this via: + +```sh +$ gcloud compute instances list --project=sourcegraph-managed-$COMPANY +NAME ZONE MACHINE_TYPE PREEMPTIBLE INTERNAL_IP EXTERNAL_IP STATUS +default-red-instance us-central1-f n1-standard-8 10.2.0.2 RUNNING +``` + +During an upgrade, both `default-red-instance` and `default-black-instance` would be present. One is production; the other is the "new" upgraded production instance. When the upgrade is complete, the old one is turned off (red/black swap). 
+ +## SSH access + +Locate the GCP instance you'd like to access (usually either `default-red-instance` or `default-black-instance`), and then: + +```sh +$ gcloud beta compute ssh --zone "us-central1-f" --tunnel-through-iap --project "sourcegraph-managed-$COMPANY" default-red-instance +``` + +If you get an error: + +```sh +ERROR: (gcloud.beta.compute.start-iap-tunnel) Error while connecting [4003: u'failed to connect to backend']. +ssh_exchange_identification: Connection closed by remote host +ERROR: (gcloud.beta.compute.ssh) [/usr/bin/ssh] exited with return code [255]. +``` + +This may be indicating that the VM is currently not running - check: + +```sh +$ gcloud beta compute instances list --project=sourcegraph-managed-$COMPANY +``` + +And start the instance if needed (e.g. through the web UI.) + +## Port-forwarding (direct access to Caddy, Jaeger, and Grafana) + +Locate the GCP instance you'd like to access (usually either `default-red-instance` or `default-black-instance`), and then: + +```sh +gcloud compute start-iap-tunnel default-red-instance 80 --local-host-port=localhost:4444 --zone "us-central1-f" --project "sourcegraph-managed-$COMPANY" +``` + +This will port-forward `localhost:4444` to port `80` on the VM instance: + +Replace `80` with `3370` for Grafana, or `16686` for Jaeger. Note that other ports are prevented by the `allow-iap-tcp-ingress` firewall rule. + +## Access through the GCP load balancer as a user would + +Users access Sourcegraph through GCP Load Balancer/HTTPS -> the Caddy load balancer/HTTP -> the actual sourcegraph-frontend/HTTP. If you suspect that an issue is being introduced by the GCP load balancer itself, e.g. perhaps a request is timing out there due to some misconfiguration, then you will need to access through the GCP load balancer itself. 
If the managed instance is protected by the load balancer firewall / not publicly accessible on the internet, then it is not possible for you to access `$COMPANY.sourcegraph.com` as a normal user would. + +You can work around this by proxying your internet traffic through the instance itself - which is allowed to reach and go through the public internet -> the GCP load balancer -> back to the instance itself. To do this, create a SOCKS5 proxy tunnel over SSH (replace `default-red-instance`, if needed): + +```sh +bash -c '(gcloud beta compute ssh --zone "us-central1-f" --tunnel-through-iap --project "sourcegraph-managed-$COMPANY" default-red-instance -- -N -p 22 -D localhost:5000) & sleep 600; kill $!' +``` + +Then test you can access it using `curl`: + +``` +$ curl --proxy socks5://localhost:5000 https://$COMPANY.sourcegraph.com +Found. +``` + +You can now reproduce the request using `curl`, or configure your OS or browser to use `socks5://localhost:5000`: + +* Windows: Follow the “using the SOCKS proxy” section in [this article](https://www.ocf.berkeley.edu/~xuanluo/sshproxywin.html) [[mirror]](https://web.archive.org/web/20160609073255/https://www.ocf.berkeley.edu/~xuanluo/sshproxywin.html) to enable it on Internet Explorer, Edge and Firefox. +* macOS: System Preferences → Network → Advanced → Proxies → check “SOCKS proxy” and enter the host and the port. +* Linux: Most browsers have proxy settings in their Settings/Preferences. +* Command-line apps: Many CLIs accept `http_proxy` or `https_proxy` environment variables, or arguments that you can use to set the proxy. Consult the help or the manpage of the program. + +**IMPORTANT**: Once you are finished, terminate the original `gcloud beta compute ssh` command so your machine's traffic is no longer going over the instance. As a safeguard in case you forget, the command above will automatically terminate after 10 minutes. 
+ +## Accessing the Docker containers + +Access the deployment over SSH (instructions above) and then: + +```sh +sudo su +docker ps +``` + +You can then use regular Docker commands (e.g. `docker exec -it $CONTAINER sh`) to interact with the containers. + +## Finding the external IPs + +```sh +$ gcloud compute addresses list --project=sourcegraph-managed-$COMPANY +NAME ADDRESS/RANGE TYPE PURPOSE NETWORK REGION SUBNET STATUS +default-global-address $GLOBAL_IP EXTERNAL IN_USE +default-nat-manual-ip-0 $NAT_IP_ONE EXTERNAL us-central1 IN_USE +default-nat-manual-ip-1 $NAT_IP_TWO EXTERNAL us-central1 IN_USE +``` + +- `$GLOBAL_IP` is the IP address that `$COMPANY.sourcegraph.com` should point to; it is the address of the GCP Load Balancer. +- `$NAT_IP_ONE` and `$NAT_IP_TWO` are the external IPs from which egress traffic from the deployment will originate. These are the addresses from which Sourcegraph will access the customer's code host, and as such the customer will need to allow them access to e.g. their internal code host. + +## Impact of recreating the instance via Terraform + +All configuration about the instance itself is stored in Terraform, so recreating the instance is a non-destructive action. A brand new VM will be provisioned by Terraform, the startup script will initialize it and mount the existing data disk back into the VM, and the Sourcegraph Docker containers will start up. + +This typically involves around 8m40s of downtime: 6m destroying the network endpoint group, and 2m creating the new instance / installing software / launching Docker services. + +## Instance is recreated when startup script changes + +Any time a startup script is changed, Terraform will plan to delete the old VM instance and recreate it such that the script runs again. + +This is a non-destructive action, aside from the fact that it involves downtime for the deployment. 
+ +## Debugging startup scripts + +View startup script logs: + +``` +cat /var/log/syslog | grep startup-script +``` + +Run startup script and debug: + +``` +sudo google_metadata_script_runner --script-type startup --debug +``` + +WARNING: Running our startup script twice is a potentially harmful action, as it is usually only run once. + +More details: https://cloud.google.com/compute/docs/startupscript diff --git a/handbook/engineering/distribution/managed/upgrade_process.md b/handbook/engineering/distribution/managed/upgrade_process.md new file mode 100644 index 00000000000..306193f2085 --- /dev/null +++ b/handbook/engineering/distribution/managed/upgrade_process.md @@ -0,0 +1,172 @@ +# Upgrading a managed instance + +- [1) Add a banner indicating scheduled maintenance is in progress](#1-add-a-banner-indicating-scheduled-maintenance-is-in-progress) +- [2) Mark the database as ready-only](#2-mark-the-database-as-ready-only) +- [3) Create a snapshot of the current deployment](#3-create-a-snapshot-of-the-current-deployment) +- [4) Initialize the new production deployment](#4-initialize-the-new-production-deployment) +- [5) Make the database on the new deployment writable](#5-make-the-database-on-the-new-deployment-writable) +- [6) Upgrade the new deployment](#6-upgrade-the-new-deployment) +- [7) Recreate the deployment](#7-recreate-the-deployment) +- [8) Confirm instance health](#8-confirm-instance-health) +- [9) Switch the load balancer target](#9-switch-the-load-balancer-target) +- [10) Remove the banner indicating scheduled maintenance is in progress](#10-remove-the-banner-indicating-scheduled-maintenance-is-in-progress) +- [11) Take down the old deployment](#11-take-down-the-old-deployment) + +## 1) Add a banner indicating scheduled maintenance is in progress + +Add to the global user settings: + +```jsonc + "notices": [ + { + "dismissible": false, + "location": "top", + "message": "🚀 Sourcegraph is undergoing a scheduled upgrade ([what's 
changed?](https://about.sourcegraph.com/blog/)). You may be unable to perform some write actions during this time, such as updating your user settings." + } + ] +``` + +## 2) Mark the database as ready-only + +Mark the database as read-only: + +```sh +docker exec -it pgsql psql -U sg -c 'ALTER DATABASE sg SET default_transaction_read_only = true;' +``` + +During this time searching will still work, but the site will refuse all database writes, e.g. this will show up in the web UI as: + +> [...]: pq: cannot execute INSERT in a read-only transaction + +During this time: + +- Repositories will not update +- Authentication permissions will not synchronize +- LSIF precise code intel cannot be uploaded +- User settings and site configuration cannot be updated +- Extensions cannot be installed + +## 3) Create a snapshot of the current deployment + +Edit `deploy-sourcegraph-managed/$CUSTOMER/terraform.tfvars` to create a snapshot of the current deployment: + +```diff + load_balancer_target = "red" + deployments = ["red"] + disks = { + red = { from_snapshot = null } + } ++snapshots = { ++ upgrade-from-3-17-2 = { from_disk = "default-red-data-disk" } ++} +``` + +Make sure the version (`3-17-2`) describes the current version of the instance, and `from_disk` is correct, then `terraform apply` it to create the snapshot. This can take anywhere from a minute to several minutes, depending on how large the disk is. 
+ +## 4) Initialize the new production deployment + +Initialize the new `black` production deployment using the snapshot created in the previous step, by editing `deploy-sourcegraph-managed/$CUSTOMER/terraform.tfvars` with something like: + +```diff + load_balancer_target = "red" +-deployments = ["red"] ++deployments = ["red", "black"] + disks = { + red = { from_snapshot = null } + black = { from_snapshot = "default-red-data-disk-snapshot--upgrade-from-3-17-2" } + } + snapshots = { + upgrade-from-3-17-2 = { from_disk = "default-red-data-disk" } + } +``` + +Make sure to use the right `red` / `black` in both the new deployment name and the `from_snapshot` value. Similarly, change the `from_snapshot` value to match the snapshot you previously took. + +Then in `deploy-sourcegraph-managed/$CUSTOMER` copy the `red` deployment's Docker Compose configuration: + +```sh +cp -R red/ black/ +git add black/ +git commit -m 'cp -R red/ black/' +``` + +Then `terraform apply` to update the metadata. This should only modify the deployment - not recreate it. + +## 5) Make the database on the new deployment writable + +On the _new_ deployment to be upgraded, make the database writable: + +```sh +docker exec -it pgsql psql -U sg -c 'set transaction read write; ALTER DATABASE sg SET default_transaction_read_only = false;' +``` + +## 6) Upgrade the new deployment + +First check the new version at https://docs.sourcegraph.com/admin/updates/docker_compose requires no manual migration steps. + +Then, to upgrade the new `black` deployment to 3.18.0: + +```sh +cd deploy-sourcegraph-managed/$CUSTOMER +VERSION=v3.18.0 ../update-docker-compose.sh black/ +# Address any merge conflicts in black/ if needed. +git add black/ +git commit -m 'update to v3.18.0' +``` + +Then `terraform apply`. 
+ +## 7) Recreate the deployment + +Take down the new `black` deployment: + +```diff + load_balancer_target = "red" +-deployments = ["red", "black"] ++deployments = ["red"] +``` + +`terraform apply` and recreate it (so the startup script runs on a clean OS disk): + +```diff + load_balancer_target = "red" +-deployments = ["red"] ++deployments = ["red", "black"] +``` + +## 8) Confirm instance health + +Access Grafana and confirm the instance is healthy. + +## 9) Switch the load balancer target + +Connect to the new instance using the SOCKS5 proxy and confirm you can access it and view the old version at https://company.sourcegraph.com/site-admin/updates + +Change the `load_balancer_target` in `terraform.tfvars` from the old deployment (e.g. `"red"`) to the new deployment (e.g. `"black"`), and `terraform apply`. During this time users will see 500 errors: + +> Error: Server Error +> The server encountered a temporary error and could not complete your request. +> Please try again in 30 seconds. + +This lasts for about 1m35s while the `google_compute_network_endpoint` gets destroyed. + +## 10) Remove the banner indicating scheduled maintenance is in progress + +Remove the notice previously added to the global user settings. + +## 11) Take down the old deployment + +Remove the old `red` deployment and its data disk: + +```diff + load_balancer_target = "black" +-deployments = ["red", "black"] ++deployments = ["black"] + disks = { +- red = { from_snapshot = null } + black = { from_snapshot = "default-red-data-disk-snapshot--upgrade-from-3-17-2" } + } +``` + +And `terraform apply` it. +