diff --git a/CHANGELOG.md b/CHANGELOG.md index 482e4684..c50cad3a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,26 @@ All notable changes to this crate are documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this crate adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [3.6.0] - 2023-06-15 + +### Added + +- Added `Decode::decode_into` to allow deserializing into uninitialized memory. +- Added a `DecodeFinished` type to be used with `Decode::decode_into`. + +### Fixed + +- Trying to deserialize a big boxed array (e.g. `Box<[u8; 1024 * 1024 * 1024]>`) won't overflow the stack anymore. +- Trying to deserialize big nested enums with many variants won't overflow the stack anymore. +- Elements of partially read arrays will now be properly dropped if the whole array wasn't decoded. + +### Changed + +- The derive macros will now be reexported only when the `derive` feature is enabled, + as opposed to how it was previously where enabling `parity-scale-codec-derive` would suffice. +- The `max-encoded-len` feature won't automatically enable the derive macros nor pull in the + `parity-scale-codec-derive` dependency. 
+ ## [3.5.0] ### Added diff --git a/Cargo.lock b/Cargo.lock index c59460a8..67a320f1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -11,6 +11,12 @@ dependencies = [ "memchr", ] +[[package]] +name = "anes" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299" + [[package]] name = "arbitrary" version = "1.0.3" @@ -85,18 +91,6 @@ dependencies = [ "wyz", ] -[[package]] -name = "bstr" -version = "0.2.17" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ba3569f383e8f1598449f1a423e72e99569137b47740b1da11ef19af3d5c3223" -dependencies = [ - "lazy_static", - "memchr", - "regex-automata", - "serde", -] - [[package]] name = "bumpalo" version = "3.12.0" @@ -123,12 +117,9 @@ checksum = "89b2fd2a0dcf38d7971e2194b6b6eebab45ae01067456a7fd93d5547a61b70be" [[package]] name = "cast" -version = "0.2.7" +version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c24dab4283a142afa2fdca129b80ad2c6284e073930f964c3a1293c225ee39a" -dependencies = [ - "rustc_version", -] +checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" [[package]] name = "cc" @@ -142,15 +133,52 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" +[[package]] +name = "ciborium" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b0c137568cc60b904a7724001b35ce2630fd00d5d84805fbb608ab89509d788f" +dependencies = [ + "ciborium-io", + "ciborium-ll", + "serde", +] + +[[package]] +name = "ciborium-io" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "346de753af073cc87b52b2083a506b38ac176a44cfb05497b622e27be899b369" + +[[package]] +name = "ciborium-ll" +version = "0.2.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "213030a2b5a4e0c0892b6652260cf6ccac84827b83a85a534e178e3906c4cf1b" +dependencies = [ + "ciborium-io", + "half", +] + [[package]] name = "clap" -version = "2.33.3" +version = "3.2.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "37e58ac78573c40708d45522f0d80fa2f01cc4f9b4e2bf749807255454312002" +checksum = "4ea181bf566f71cb9a5d17a59e1871af638180a18fb0035c92ae62b705207123" dependencies = [ "bitflags", + "clap_lex", + "indexmap", "textwrap", - "unicode-width", +] + +[[package]] +name = "clap_lex" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2850f2f5a82cbf437dd5af4d49848fbdfc27c157c3d010345776f952765261c5" +dependencies = [ + "os_str_bytes", ] [[package]] @@ -165,15 +193,16 @@ dependencies = [ [[package]] name = "criterion" -version = "0.3.5" +version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1604dafd25fba2fe2d5895a9da139f8dc9b319a5fe5354ca137cbbce4e178d10" +checksum = "e7c76e09c1aae2bc52b3d2f29e13c6572553b30c4aa1b8a49fd70de6412654cb" dependencies = [ + "anes", "atty", "cast", + "ciborium", "clap", "criterion-plot", - "csv", "itertools", "lazy_static", "num-traits", @@ -182,7 +211,6 @@ dependencies = [ "rayon", "regex", "serde", - "serde_cbor", "serde_derive", "serde_json", "tinytemplate", @@ -191,9 +219,9 @@ dependencies = [ [[package]] name = "criterion-plot" -version = "0.4.4" +version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d00996de9f2f7559f7f4dc286073197f83e92256a59ed395f9aac01fe717da57" +checksum = "6b50826342786a51a89e2da3a28f1c32b06e387201bc2d19791f622c673706b1" dependencies = [ "cast", "itertools", @@ -243,28 +271,6 @@ dependencies = [ "lazy_static", ] -[[package]] -name = "csv" -version = "1.1.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"22813a6dc45b335f9bade10bf7271dc477e81113e89eb251a0bc2a8a81c536e1" -dependencies = [ - "bstr", - "csv-core", - "itoa", - "ryu", - "serde", -] - -[[package]] -name = "csv-core" -version = "0.1.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2b2466559f260f48ad25fe6317b3c8dac77b5bdb5763ac7d9d6103530663bc90" -dependencies = [ - "memchr", -] - [[package]] name = "derive_arbitrary" version = "1.0.2" @@ -367,6 +373,12 @@ version = "1.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "eabb4a44450da02c90444cf74558da904edde8fb4e9035a9a6a4e15445af0bd7" +[[package]] +name = "hashbrown" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888" + [[package]] name = "hermit-abi" version = "0.1.19" @@ -405,6 +417,16 @@ dependencies = [ "syn 1.0.98", ] +[[package]] +name = "indexmap" +version = "1.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bd070e393353796e801d209ad339e89596eb4c8d430d18ede6a1cced8fafbd99" +dependencies = [ + "autocfg", + "hashbrown", +] + [[package]] name = "instant" version = "0.1.12" @@ -538,9 +560,15 @@ version = "11.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0ab1bc2a289d34bd04a330323ac98a1b4bc82c9d9fcb1e66b63caa84da26b575" +[[package]] +name = "os_str_bytes" +version = "6.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ceedf44fb00f2d1984b0bc98102627ce622e083e49a5bacdb3e514fa4238e267" + [[package]] name = "parity-scale-codec" -version = "3.5.0" +version = "3.6.0" dependencies = [ "arbitrary", "arrayvec", @@ -561,7 +589,7 @@ dependencies = [ [[package]] name = "parity-scale-codec-derive" -version = "3.1.4" +version = "3.6.0" dependencies = [ "parity-scale-codec", "proc-macro-crate", @@ -765,12 +793,6 @@ dependencies = [ "regex-syntax", ] -[[package]] -name = 
"regex-automata" -version = "0.1.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132" - [[package]] name = "regex-syntax" version = "0.6.26" @@ -835,35 +857,28 @@ checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" [[package]] name = "semver" -version = "1.0.4" +version = "1.0.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "568a8e6258aa33c13358f81fd834adb854c6f7c9468520910a9b1e8fac068012" +checksum = "bebd363326d05ec3e2f532ab7660680f3b02130d780c299bca73469d521bc0ed" [[package]] name = "serde" version = "1.0.164" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9e8c8cf938e98f769bc164923b06dce91cea1751522f46f8466461af04c9027d" - -[[package]] -name = "serde_cbor" -version = "0.11.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2bef2ebfde456fb76bbcf9f59315333decc4fda0b2b44b420243c11e0f5ec1f5" dependencies = [ - "half", - "serde", + "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.163" +version = "1.0.164" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8c805777e3930c8883389c602315a24224bcc738b63905ef87cd1420353ea93e" +checksum = "d9735b638ccc51c28bf6914d90a2e9725b377144fc612c49a611fddd1b631d68" dependencies = [ "proc-macro2", "quote", - "syn 2.0.12", + "syn 2.0.16", ] [[package]] @@ -890,9 +905,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.12" +version = "2.0.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "79d9531f94112cfc3e4c8f5f02cb2b58f72c97b7efd85f70203cc6d8efda5927" +checksum = "a6f671d4b5ffdb8eadec19c0ae67fe2639df8684bd7bc4b83d986b8db549cf01" dependencies = [ "proc-macro2", "quote", @@ -929,12 +944,9 @@ dependencies = [ [[package]] name = "textwrap" -version = "0.11.0" +version = "0.16.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "d326610f408c7a4eb6f51c37c330e496b08506c9457c9d34287ecc38809fb060" -dependencies = [ - "unicode-width", -] +checksum = "222a222a5bfe1bba4a77b45ec488a741b3cb8872e5e499451fd7d0129c9c7c3d" [[package]] name = "thiserror" @@ -1008,12 +1020,6 @@ version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5bd2fe26506023ed7b5e1e315add59d6f584c621d037f9368fea9cfb988f368c" -[[package]] -name = "unicode-width" -version = "0.1.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3ed742d4ea2bd1176e236172c8429aaf54486e7ac098db29ffe6529e0ce50973" - [[package]] name = "version_check" version = "0.9.3" diff --git a/Cargo.toml b/Cargo.toml index 7a9fa913..f43c6cf5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "parity-scale-codec" description = "SCALE - Simple Concatenating Aggregated Little Endians" -version = "3.5.0" +version = "3.6.0" authors = ["Parity Technologies "] license = "Apache-2.0" repository = "https://github.com/paritytech/parity-scale-codec" @@ -12,7 +12,7 @@ rust-version = "1.60.0" [dependencies] arrayvec = { version = "0.7", default-features = false } serde = { version = "1.0.164", optional = true } -parity-scale-codec-derive = { path = "derive", version = "3.1.4", default-features = false, optional = true } +parity-scale-codec-derive = { path = "derive", version = ">= 3.6.0", default-features = false, optional = true } bitvec = { version = "1", default-features = false, features = [ "alloc" ], optional = true } bytes = { version = "1", default-features = false, optional = true } byte-slice-cast = { version = "1.2.2", default-features = false } @@ -21,7 +21,7 @@ arbitrary = { version = "1.0.1", features = ["derive"], optional = true } impl-trait-for-tuples = "0.2.2" [dev-dependencies] -criterion = "0.3.0" +criterion = "0.4.0" serde_derive = { version = "1.0" } parity-scale-codec-derive = { path = 
"derive", default-features = false } quickcheck = "1.0" @@ -46,7 +46,7 @@ fuzz = ["std", "arbitrary"] # Enables the new `MaxEncodedLen` trait. # NOTE: This is still considered experimental and is exempt from the usual # SemVer guarantees. We do not guarantee no code breakage when using this. -max-encoded-len = ["parity-scale-codec-derive/max-encoded-len"] +max-encoded-len = ["parity-scale-codec-derive?/max-encoded-len"] # Make error fully descriptive with chaining error message. # Should not be used in a constrained environment. diff --git a/README.md b/README.md index dadbc926..8b079051 100644 --- a/README.md +++ b/README.md @@ -7,13 +7,12 @@ SCALE is a light-weight format which allows encoding (and decoding) which makes suitable for resource-constrained execution environments like blockchain runtimes and low-power, low-memory devices. -It is important to note that the encoding context (knowledge of how the types and data structures look) -needs to be known separately at both encoding and decoding ends. +It is important to note that the encoding context (knowledge of how the types and data +structures look) needs to be known separately at both encoding and decoding ends. The encoded data does not include this contextual information. To get a better understanding of how the encoding is done for different types, -take a look at the -[low-level data formats overview page at the Substrate docs site](https://docs.substrate.io/reference/scale-codec/). +take a look at the ["Type encoding (SCALE)" page in Substrate docs](https://docs.substrate.io/reference/scale-codec/). ## Implementation @@ -21,45 +20,52 @@ The codec is implemented using the following traits: ### Encode -The `Encode` trait is used for encoding of data into the SCALE format. The `Encode` trait contains the following functions: +The `Encode` trait is used for encoding of data into the SCALE format. 
The `Encode` trait +contains the following functions: + * `size_hint(&self) -> usize`: Gets the capacity (in bytes) required for the encoded data. -This is to avoid double-allocation of memory needed for the encoding. -It can be an estimate and does not need to be an exact number. -If the size is not known, even no good maximum, then we can skip this function from the trait implementation. -This is required to be a cheap operation, so should not involve iterations etc. -* `encode_to(&self, dest: &mut T)`: Encodes the value and appends it to a destination buffer. + This is to avoid double-allocation of memory needed for the encoding. It can be an estimate + and does not need to be an exact number. If the size is not known, even no good maximum, then + we can skip this function from the trait implementation. This is required to be a cheap operation, + so should not involve iterations etc. +* `encode_to(&self, dest: &mut T)`: Encodes the value and appends it to a destination + buffer. * `encode(&self) -> Vec`: Encodes the type data and returns a slice. -* `using_encoded R>(&self, f: F) -> R`: Encodes the type data and executes a closure on the encoded value. -Returns the result from the executed closure. +* `using_encoded R>(&self, f: F) -> R`: Encodes the type data and + executes a closure on the encoded value. Returns the result from the executed closure. -**Note:** Implementations should override `using_encoded` for value types and `encode_to` for allocating types. -`size_hint` should be implemented for all types, wherever possible. Wrapper types should override all methods. +**Note:** Implementations should override `using_encoded` for value types and `encode_to` for +allocating types. `size_hint` should be implemented for all types, wherever possible. Wrapper +types should override all methods. ### Decode -The `Decode` trait is used for deserialization/decoding of encoded data into the respective types. 
+The `Decode` trait is used for deserialization/decoding of encoded data into the respective +types. -* `fn decode(value: &mut I) -> Result`: Tries to decode the value from SCALE format to the type it is called on. -Returns an `Err` if the decoding fails. +* `fn decode(value: &mut I) -> Result`: Tries to decode the value from + SCALE format to the type it is called on. Returns an `Err` if the decoding fails. ### CompactAs -The `CompactAs` trait is used for wrapping custom types/structs as compact types, which makes them even more space/memory efficient. -The compact encoding is described [here](https://docs.substrate.io/reference/scale-codec/#fnref-1). +The `CompactAs` trait is used for wrapping custom types/structs as compact types, which makes +them even more space/memory efficient. The compact encoding is described [here](https://docs.substrate.io/reference/scale-codec/#fn-1). * `encode_as(&self) -> &Self::As`: Encodes the type (self) as a compact type. -The type `As` is defined in the same trait and its implementation should be compact encode-able. -* `decode_from(_: Self::As) -> Result`: Decodes the type (self) from a compact encode-able type. + The type `As` is defined in the same trait and its implementation should be compact encode-able. +* `decode_from(_: Self::As) -> Result`: Decodes the type (self) from a compact + encode-able type. ### HasCompact -The `HasCompact` trait, if implemented, tells that the corresponding type is a compact encode-able type. +The `HasCompact` trait, if implemented, tells that the corresponding type is a compact +encode-able type. ### EncodeLike The `EncodeLike` trait needs to be implemented for each type manually. When using derive, it is -done automatically for you. Basically the trait gives you the opportunity to accept multiple types -to a function that all encode to the same representation. +done automatically for you. 
Basically the trait gives you the opportunity to accept multiple +types to a function that all encode to the same representation. ## Usage Examples @@ -68,19 +74,21 @@ Following are some examples to demonstrate usage of the codec. ### Simple types ```rust +# // Import macros if derive feature is not used. +# #[cfg(not(feature="derive"))] +# use parity_scale_codec_derive::{Encode, Decode}; use parity_scale_codec::{Encode, Decode}; -use parity_scale_codec_derive::{Encode, Decode}; #[derive(Debug, PartialEq, Encode, Decode)] enum EnumType { - #[codec(index = 15)] - A, - B(u32, u64), - C { - a: u32, - b: u64, - }, + #[codec(index = 15)] + A, + B(u32, u64), + C { + a: u32, + b: u64, + }, } let a = EnumType::A; @@ -111,14 +119,17 @@ assert_eq!(EnumType::decode(&mut dc).ok(), Some(c)); let mut dz: &[u8] = &[0]; assert_eq!(EnumType::decode(&mut dz).ok(), None); +# fn main() { } ``` ### Compact type with HasCompact ```rust +# // Import macros if derive feature is not used. +# #[cfg(not(feature="derive"))] +# use parity_scale_codec_derive::{Encode, Decode}; use parity_scale_codec::{Encode, Decode, Compact, HasCompact}; -use parity_scale_codec_derive::{Encode, Decode}; #[derive(Debug, PartialEq, Encode, Decode)] struct Test1CompactHasCompact { @@ -138,10 +149,15 @@ let encoded = Test1HasCompact { bar: test_val.0 }.encode(); assert_eq!(encoded.len(), test_val.1); assert_eq!(>::decode(&mut &encoded[..]).unwrap().bar, test_val.0); +# fn main() { } ``` + ### Type with CompactAs ```rust +# // Import macros if derive feature is not used. 
+# #[cfg(not(feature="derive"))] +# use parity_scale_codec_derive::{Encode, Decode}; use serde_derive::{Serialize, Deserialize}; use parity_scale_codec::{Encode, Decode, Compact, HasCompact, CompactAs, Error}; @@ -182,28 +198,41 @@ let a = TestGenericHasCompact::A:: { let encoded = a.encode(); assert_eq!(encoded.len(), 2); +# fn main() { } ``` ## Derive attributes The derive implementation supports the following attributes: -- `codec(dumb_trait_bound)`: This attribute needs to be placed above the type that one of the trait - should be implemented for. It will make the algorithm that determines the to-add trait bounds - fall back to just use the type parameters of the type. This can be useful for situation where - the algorithm includes private types in the public interface. By using this attribute, you should - not get this error/warning again. +- `codec(dumb_trait_bound)`: This attribute needs to be placed above the type that one of the + trait should be implemented for. It will make the algorithm that determines the to-add trait + bounds fall back to just use the type parameters of the type. This can be useful for situation + where the algorithm includes private types in the public interface. By using this attribute, + you should not get this error/warning again. - `codec(skip)`: Needs to be placed above a field or variant and makes it to be skipped while encoding/decoding. - `codec(compact)`: Needs to be placed above a field and makes the field use compact encoding. (The type needs to support compact encoding.) -- `codec(encoded_as = "OtherType")`: Needs to be placed above a field and makes the field being encoded - by using `OtherType`. +- `codec(encoded_as = "OtherType")`: Needs to be placed above a field and makes the field being + encoded by using `OtherType`. - `codec(index = 0)`: Needs to be placed above an enum variant to make the variant use the given index when encoded. 
By default the index is determined by counting from `0` beginning wth the first variant. -- `codec(crate = path::to::crate)`: Specify a path to the parity-scale-codec crate instance to use - when referring to Codec APIs from generated code. This is normally only applicable when invoking - re-exported Codec derives from a public macro in a different crate. - +- `codec(encode_bound)`, `codec(decode_bound)` and `codec(mel_bound)`: All 3 attributes take + in a `where` clause for the `Encode`, `Decode` and `MaxEncodedLen` trait implementation for + the annotated type respectively. +- `codec(encode_bound(skip_type_params))`, `codec(decode_bound(skip_type_params))` and + `codec(mel_bound(skip_type_params))`: All 3 sub-attributes take in types as arguments to skip + trait derivation of the corresponding trait, e.g. T in + `codec(encode_bound(skip_type_params(T)))` will not contain an `Encode` trait bound while + `Encode` is being derived for the annotated type. + +## Known issues + +Even though this crate supports deserialization of arbitrarily sized arrays (e.g. `[T; 1024 * 1024 * 1024]`) +using such types is not recommended and will most likely result in a stack overflow. If you have a big +array inside of your structure which you want to decode you should wrap it in a `Box`, e.g. `Box<[T; 1024 * 1024 * 1024]>`. 
+ +------------------------- License: Apache-2.0 diff --git a/benches/benches.rs b/benches/benches.rs index 248147e2..f52e9fbd 100644 --- a/benches/benches.rs +++ b/benches/benches.rs @@ -113,46 +113,57 @@ fn vec_append_with_encode_append(b: &mut Bencher) { } fn encode_decode_vec + Codec>(c: &mut Criterion) where T::Error: std::fmt::Debug { - c.bench_function_over_inputs(&format!("vec_encode_{}", type_name::()), |b, &vec_size| { - let vec: Vec = (0..=127u8) - .cycle() - .take(vec_size) - .map(|v| v.try_into().unwrap()) - .collect(); - - let vec = black_box(vec); - b.iter(|| vec.encode()) - }, vec![1, 2, 5, 32, 1024, 2048, 16384]); - - c.bench_function_over_inputs(&format!("vec_decode_{}", type_name::()), |b, &vec_size| { - let vec: Vec = (0..=127u8) - .cycle() - .take(vec_size) - .map(|v| v.try_into().unwrap()) - .collect(); - - let vec = vec.encode(); - - let vec = black_box(vec); - b.iter(|| { - let _: Vec = Decode::decode(&mut &vec[..]).unwrap(); - }) - }, vec![1, 2, 5, 32, 1024, 2048, 16384]); - - c.bench_function_over_inputs(&format!("vec_decode_no_limit_{}", type_name::()), |b, &vec_size| { - let vec: Vec = (0..=127u8) - .cycle() - .take(vec_size) - .map(|v| v.try_into().unwrap()) - .collect(); - - let vec = vec.encode(); - - let vec = black_box(vec); - b.iter(|| { - let _: Vec = Decode::decode(&mut NoLimitInput(&vec[..])).unwrap(); - }) - }, vec![16384, 131072]); + let mut g = c.benchmark_group("vec_encode"); + for vec_size in [1, 2, 5, 32, 1024, 2048, 16384] { + g.bench_with_input(&format!("{}/{}", type_name::(), vec_size), &vec_size, |b, &vec_size| { + let vec: Vec = (0..=127u8) + .cycle() + .take(vec_size) + .map(|v| v.try_into().unwrap()) + .collect(); + + let vec = black_box(vec); + b.iter(|| vec.encode()) + }); + } + + core::mem::drop(g); + let mut g = c.benchmark_group("vec_decode"); + for vec_size in [1, 2, 5, 32, 1024, 2048, 16384] { + g.bench_with_input(&format!("{}/{}", type_name::(), vec_size), &vec_size, |b, &vec_size| { + let vec: Vec = 
(0..=127u8) + .cycle() + .take(vec_size) + .map(|v| v.try_into().unwrap()) + .collect(); + + let vec = vec.encode(); + + let vec = black_box(vec); + b.iter(|| { + let _: Vec = Decode::decode(&mut &vec[..]).unwrap(); + }) + }); + } + + core::mem::drop(g); + let mut g = c.benchmark_group("vec_decode_no_limit"); + for vec_size in [16384, 131072] { + g.bench_with_input(&format!("vec_decode_no_limit_{}/{}", type_name::(), vec_size), &vec_size, |b, &vec_size| { + let vec: Vec = (0..=127u8) + .cycle() + .take(vec_size) + .map(|v| v.try_into().unwrap()) + .collect(); + + let vec = vec.encode(); + + let vec = black_box(vec); + b.iter(|| { + let _: Vec = Decode::decode(&mut NoLimitInput(&vec[..])).unwrap(); + }) + }); + } } fn encode_decode_complex_type(c: &mut Criterion) { @@ -168,25 +179,33 @@ fn encode_decode_complex_type(c: &mut Criterion) { ComplexType { _val: 1000, _other_val: 0980345634635, _vec: vec![1, 2, 3, 5, 6, 7] }, ComplexType { _val: 43564, _other_val: 342342345634635, _vec: vec![1, 2, 3, 5, 6, 7] }, ]; - let complex_types2 = complex_types.clone(); - c.bench_function_over_inputs("vec_encode_complex_type", move |b, &vec_size| { - let vec: Vec = complex_types.clone().into_iter().cycle().take(vec_size).collect(); + let mut g = c.benchmark_group("vec_encode_complex_type"); + for vec_size in [1, 2, 5, 32, 1024, 2048, 16384] { + let complex_types = complex_types.clone(); + g.bench_with_input(format!("vec_encode_complex_type/{}", vec_size), &vec_size, move |b, &vec_size| { + let vec: Vec = complex_types.clone().into_iter().cycle().take(vec_size).collect(); - let vec = black_box(vec); - b.iter(|| vec.encode()) - }, vec![1, 2, 5, 32, 1024, 2048, 16384]); + let vec = black_box(vec); + b.iter(|| vec.encode()) + }); + } - c.bench_function_over_inputs("vec_decode_complex_type", move |b, &vec_size| { - let vec: Vec = complex_types2.clone().into_iter().cycle().take(vec_size).collect(); + core::mem::drop(g); + let mut g = c.benchmark_group("vec_decode_complex_type"); + for 
vec_size in [1, 2, 5, 32, 1024, 2048, 16384] { + let complex_types = complex_types.clone(); + g.bench_with_input(format!("vec_decode_complex_type/{}", vec_size), &vec_size, move |b, &vec_size| { + let vec: Vec = complex_types.clone().into_iter().cycle().take(vec_size).collect(); - let vec = vec.encode(); + let vec = vec.encode(); - let vec = black_box(vec); - b.iter(|| { - let _: Vec = Decode::decode(&mut &vec[..]).unwrap(); - }) - }, vec![1, 2, 5, 32, 1024, 2048, 16384]); + let vec = black_box(vec); + b.iter(|| { + let _: Vec = Decode::decode(&mut &vec[..]).unwrap(); + }) + }); + } } fn bench_fn(c: &mut Criterion) { @@ -202,34 +221,44 @@ fn encode_decode_bitvec_u8(c: &mut Criterion) { let _ = c; #[cfg(feature = "bit-vec")] - c.bench_function_over_inputs("bitvec_u8_encode - BitVec", |b, &size| { - let vec: BitVec = [true, false] - .iter() - .cloned() - .cycle() - .take(size) - .collect(); - - let vec = black_box(vec); - b.iter(|| vec.encode()) - }, vec![1, 2, 5, 32, 1024]); + { + let mut g = c.benchmark_group("bitvec_u8_encode"); + for size in [1, 2, 5, 32, 1024] { + g.bench_with_input(size.to_string(), &size, |b, &size| { + let vec: BitVec = [true, false] + .iter() + .cloned() + .cycle() + .take(size) + .collect(); + + let vec = black_box(vec); + b.iter(|| vec.encode()) + }); + } + } #[cfg(feature = "bit-vec")] - c.bench_function_over_inputs("bitvec_u8_decode - BitVec", |b, &size| { - let vec: BitVec = [true, false] - .iter() - .cloned() - .cycle() - .take(size) - .collect(); - - let vec = vec.encode(); - - let vec = black_box(vec); - b.iter(|| { - let _: BitVec = Decode::decode(&mut &vec[..]).unwrap(); - }) - }, vec![1, 2, 5, 32, 1024]); + { + let mut g = c.benchmark_group("bitvec_u8_decode"); + for size in [1, 2, 5, 32, 1024] { + g.bench_with_input(size.to_string(), &size, |b, &size| { + let vec: BitVec = [true, false] + .iter() + .cloned() + .cycle() + .take(size) + .collect(); + + let vec = vec.encode(); + + let vec = black_box(vec); + b.iter(|| { + let _: 
BitVec = Decode::decode(&mut &vec[..]).unwrap(); + }) + }); + } + } } criterion_group!{ diff --git a/derive/Cargo.toml b/derive/Cargo.toml index 77a330e3..575d7565 100644 --- a/derive/Cargo.toml +++ b/derive/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "parity-scale-codec-derive" description = "Serialization and deserialization derive macro for Parity SCALE Codec" -version = "3.1.4" +version = "3.6.0" authors = ["Parity Technologies "] license = "Apache-2.0" edition = "2021" @@ -17,7 +17,7 @@ proc-macro2 = "1.0.60" proc-macro-crate = "1.1.3" [dev-dependencies] -parity-scale-codec = { path = "..", features = ["max-encoded-len"] } +parity-scale-codec = { path = "..", features = ["derive", "max-encoded-len"] } [features] # Enables the new `MaxEncodedLen` trait. diff --git a/derive/src/decode.rs b/derive/src/decode.rs index df603902..5787ff5b 100644 --- a/derive/src/decode.rs +++ b/derive/src/decode.rs @@ -72,7 +72,12 @@ pub fn quote( quote_spanned! { v.span() => __codec_x_edqy if __codec_x_edqy == #index as ::core::primitive::u8 => { - #create + // NOTE: This lambda is necessary to work around an upstream bug + // where each extra branch results in excessive stack usage: + // https://github.com/rust-lang/rust/issues/34283 + return (move || { + #create + })(); }, } }); @@ -90,9 +95,13 @@ pub fn quote( .map_err(|e| e.chain(#read_byte_err_msg))? 
{ #( #recurse )* - _ => ::core::result::Result::Err( - <_ as ::core::convert::Into<_>>::into(#invalid_variant_err_msg) - ), + _ => { + return (move || { + return ::core::result::Result::Err( + <_ as ::core::convert::Into<_>>::into(#invalid_variant_err_msg) + ); + })(); + }, } } diff --git a/src/codec.rs b/src/codec.rs index db46843b..5ccc30ae 100644 --- a/src/codec.rs +++ b/src/codec.rs @@ -22,11 +22,9 @@ use core::{ mem, mem::{ MaybeUninit, - forget, }, ops::{Deref, Range, RangeInclusive}, time::Duration, - ptr, }; use core::num::{ NonZeroI8, @@ -56,6 +54,7 @@ use crate::alloc::{ vec::Vec, }; use crate::compact::Compact; +use crate::DecodeFinished; use crate::encode_like::EncodeLike; use crate::Error; @@ -296,6 +295,24 @@ pub trait Decode: Sized { /// Attempt to deserialise the value from input. fn decode(input: &mut I) -> Result; + /// Attempt to deserialize the value from input into a pre-allocated piece of memory. + /// + /// The default implementation will just call [`Decode::decode`]. + /// + /// # Safety + /// + /// If this function returns `Ok` then `dst` **must** be properly initialized. + /// + /// This is enforced by requiring the implementation to return a [`DecodeFinished`] + /// which can only be created by calling [`DecodeFinished::assert_decoding_finished`] which is `unsafe`. + fn decode_into(input: &mut I, dst: &mut MaybeUninit) -> Result { + let value = Self::decode(input)?; + dst.write(value); + + // SAFETY: We've written the decoded value to `dst` so calling this is safe. + unsafe { Ok(DecodeFinished::assert_decoding_finished()) } + } + /// Attempt to skip the encoded value from input. /// /// The default implementation of this function is just calling [`Decode::decode`]. @@ -488,29 +505,101 @@ impl Encode for X where pub trait WrapperTypeDecode: Sized { /// A wrapped type. type Wrapped: Into; + + // !INTERNAL USE ONLY! + // This is used to specialize `decode` for the wrapped type. 
+ #[doc(hidden)] + #[inline] + fn decode_wrapped(input: &mut I) -> Result where Self::Wrapped: Decode { + input.descend_ref()?; + let result = Ok(Self::Wrapped::decode(input)?.into()); + input.ascend_ref(); + result + } } + impl WrapperTypeDecode for Box { type Wrapped = T; + + fn decode_wrapped(input: &mut I) -> Result where Self::Wrapped: Decode { + input.descend_ref()?; + + // Placement new is not yet stable, but we can just manually allocate a chunk of memory + // and convert it to a `Box` ourselves. + // + // The explicit types here are written out for clarity. + // + // TODO: Use `Box::new_uninit` once that's stable. + let layout = core::alloc::Layout::new::>(); + + let ptr: *mut MaybeUninit = if layout.size() == 0 { + core::ptr::NonNull::dangling().as_ptr() + } else { + + // SAFETY: Layout has a non-zero size so calling this is safe. + let ptr: *mut u8 = unsafe { + crate::alloc::alloc::alloc(layout) + }; + + if ptr.is_null() { + crate::alloc::alloc::handle_alloc_error(layout); + } + + ptr.cast() + }; + + // SAFETY: Constructing a `Box` from a piece of memory allocated with `std::alloc::alloc` + // is explicitly allowed as long as it was allocated with the global allocator + // and the memory layout matches. + // + // Constructing a `Box` from `NonNull::dangling` is also always safe as long + // as the underlying type is zero-sized. + let mut boxed: Box> = unsafe { Box::from_raw(ptr) }; + + T::decode_into(input, &mut boxed)?; + + // Decoding succeeded, so let's get rid of `MaybeUninit`. + // + // TODO: Use `Box::assume_init` once that's stable. + let ptr: *mut MaybeUninit = Box::into_raw(boxed); + let ptr: *mut T = ptr.cast(); + + // SAFETY: `MaybeUninit` doesn't affect the memory layout, so casting the pointer back + // into a `Box` is safe. 
+ let boxed: Box = unsafe { Box::from_raw(ptr) }; + + input.ascend_ref(); + Ok(boxed) + } } + impl WrapperTypeDecode for Rc { type Wrapped = T; + + fn decode_wrapped(input: &mut I) -> Result where Self::Wrapped: Decode { + // TODO: This is inefficient; use `Rc::new_uninit` once that's stable. + Box::::decode(input).map(|output| output.into()) + } } + #[cfg(target_has_atomic = "ptr")] impl WrapperTypeDecode for Arc { type Wrapped = T; + + fn decode_wrapped(input: &mut I) -> Result where Self::Wrapped: Decode { + // TODO: This is inefficient; use `Arc::new_uninit` once that's stable. + Box::::decode(input).map(|output| output.into()) + } } impl Decode for X where T: Decode + Into, X: WrapperTypeDecode, { + #[inline] fn decode(input: &mut I) -> Result { - input.descend_ref()?; - let result = Ok(T::decode(input)?.into()); - input.ascend_ref(); - result + Self::decode_wrapped(input) } - } /// A macro that matches on a [`TypeInfo`] and expands a given macro per variant. @@ -732,90 +821,6 @@ pub(crate) fn encode_slice_no_len(slice: &[T], de } } -/// Decode the array. -/// -/// This is equivalent to decoding all the element one by one, but it is optimized for some types. -#[inline] -pub(crate) fn decode_array(input: &mut I) -> Result<[T; N], Error> { - #[inline] - fn general_array_decode(input: &mut I) -> Result<[T; N], Error> { - let mut uninit = >::uninit(); - // The following line coerces the pointer to the array to a pointer - // to the first array element which is equivalent. - let mut ptr = uninit.as_mut_ptr() as *mut T; - for _ in 0..N { - let decoded = T::decode(input)?; - // SAFETY: We do not read uninitialized array contents - // while initializing them. - unsafe { - ptr::write(ptr, decoded); - } - // SAFETY: Point to the next element after every iteration. - // We do this N times therefore this is safe. - ptr = unsafe { ptr.add(1) }; - } - // SAFETY: All array elements have been initialized above. 
- let init = unsafe { uninit.assume_init() }; - Ok(init) - } - - // Description for the code below. - // It is not possible to transmute `[u8; N]` into `[T; N]` due to this issue: - // https://github.com/rust-lang/rust/issues/61956 - // - // Workaround: Transmute `&[u8; N]` into `&[T; N]` and interpret that reference as value. - // ``` - // let mut array: [u8; N] = [0; N]; - // let ref_typed: &[T; N] = unsafe { mem::transmute(&array) }; - // let typed: [T; N] = unsafe { ptr::read(ref_typed) }; - // forget(array); - // Here `array` and `typed` points on the same memory. - // Function returns `typed` -> it is not dropped, but `array` will be dropped. - // To avoid that `array` should be forgotten. - // ``` - macro_rules! decode { - ( u8 ) => {{ - let mut array: [u8; N] = [0; N]; - input.read(&mut array[..])?; - let ref_typed: &[T; N] = unsafe { mem::transmute(&array) }; - let typed: [T; N] = unsafe { ptr::read(ref_typed) }; - forget(array); - Ok(typed) - }}; - ( i8 ) => {{ - let mut array: [i8; N] = [0; N]; - let bytes = unsafe { mem::transmute::<&mut [i8], &mut [u8]>(&mut array[..]) }; - input.read(bytes)?; - - let ref_typed: &[T; N] = unsafe { mem::transmute(&array) }; - let typed: [T; N] = unsafe { ptr::read(ref_typed) }; - forget(array); - Ok(typed) - }}; - ( $ty:ty ) => {{ - if cfg!(target_endian = "little") { - let mut array: [$ty; N] = [0 as $ty; N]; - let bytes = <[$ty] as AsMutByteSlice<$ty>>::as_mut_byte_slice(&mut array[..]); - input.read(bytes)?; - let ref_typed: &[T; N] = unsafe { mem::transmute(&array) }; - let typed: [T; N] = unsafe { ptr::read(ref_typed) }; - forget(array); - Ok(typed) - } else { - general_array_decode(input) - } - }}; - } - - with_type_info! { - ::TYPE_INFO, - decode, - { - general_array_decode(input) - }, - } -} - /// Decode the vec (without a prepended len). 
/// /// This is equivalent to decode all elements one by one, but it is optimized in some @@ -884,10 +889,133 @@ impl Encode for [T; N] { } } +const fn calculate_array_bytesize() -> usize { + struct AssertNotOverflow(core::marker::PhantomData); + + impl AssertNotOverflow { + const OK: () = assert!(core::mem::size_of::().checked_mul(N).is_some(), "array size overflow"); + } + + let () = AssertNotOverflow::::OK; + core::mem::size_of::() * N +} + impl Decode for [T; N] { - #[inline] + #[inline(always)] fn decode(input: &mut I) -> Result { - decode_array(input) + let mut array = MaybeUninit::uninit(); + Self::decode_into(input, &mut array)?; + + // SAFETY: `decode_into` succeeded, so the array is initialized. + unsafe { + Ok(array.assume_init()) + } + } + + fn decode_into(input: &mut I, dst: &mut MaybeUninit) -> Result { + let is_primitive = match ::TYPE_INFO { + | TypeInfo::U8 + | TypeInfo::I8 + => true, + | TypeInfo::U16 + | TypeInfo::I16 + | TypeInfo::U32 + | TypeInfo::I32 + | TypeInfo::U64 + | TypeInfo::I64 + | TypeInfo::U128 + | TypeInfo::I128 + | TypeInfo::F32 + | TypeInfo::F64 + => cfg!(target_endian = "little"), + TypeInfo::Unknown => false + }; + + if is_primitive { + // Let's read the array in bulk as that's going to be a lot + // faster than just reading each element one-by-one. + + let ptr: *mut [T; N] = dst.as_mut_ptr(); + let ptr: *mut u8 = ptr.cast(); + + let bytesize = calculate_array_bytesize::(); + + // TODO: This is potentially slow; it'd be better if `Input` supported + // reading directly into uninitialized memory. + // + // SAFETY: The pointer is valid and points to a memory `bytesize` bytes big. + unsafe { + ptr.write_bytes(0, bytesize); + } + + // SAFETY: We've zero-initialized everything so creating a slice here is safe. + let slice: &mut [u8] = unsafe { + core::slice::from_raw_parts_mut(ptr, bytesize) + }; + + input.read(slice)?; + + // SAFETY: We've initialized the whole slice so calling this is safe. 
+ unsafe { + return Ok(DecodeFinished::assert_decoding_finished()); + } + } + + let slice: &mut [MaybeUninit; N] = { + let ptr: *mut [T; N] = dst.as_mut_ptr(); + let ptr: *mut [MaybeUninit; N] = ptr.cast(); + // SAFETY: Casting `&mut MaybeUninit<[T; N]>` into `&mut [MaybeUninit; N]` is safe. + unsafe { &mut *ptr } + }; + + /// A wrapper type to make sure the partially read elements are always + /// dropped in case an error occurs or the underlying `decode` implementation panics. + struct State<'a, T, const N: usize> { + count: usize, + slice: &'a mut [MaybeUninit; N] + } + + impl<'a, T, const N: usize> Drop for State<'a, T, N> { + fn drop(&mut self) { + if !core::mem::needs_drop::() { + // If the types don't actually need to be dropped then don't even + // try to run the loop below. + // + // Most likely won't make a difference in release mode, but will + // make a difference in debug mode. + return; + } + + // TODO: Use `MaybeUninit::slice_assume_init_mut` + `core::ptr::drop_in_place` + // once `slice_assume_init_mut` is stable. + for item in &mut self.slice[..self.count] { + // SAFETY: Each time we've read a new element we incremented `count`, + // and we only drop at most `count` elements here, + // so all of the elements we drop here are valid. + unsafe { + item.assume_init_drop(); + } + } + } + } + + let mut state = State { + count: 0, + slice + }; + + while state.count < state.slice.len() { + T::decode_into(input, &mut state.slice[state.count])?; + state.count += 1; + } + + // We've successfully read everything, so disarm the `Drop` impl. + core::mem::forget(state); + + // SAFETY: We've initialized the whole slice so calling this is safe. 
+ unsafe { + return Ok(DecodeFinished::assert_decoding_finished()); + } } fn skip(input: &mut I) -> Result<(), Error> { diff --git a/src/decode_finished.rs b/src/decode_finished.rs new file mode 100644 index 00000000..aa6feb98 --- /dev/null +++ b/src/decode_finished.rs @@ -0,0 +1,36 @@ +// Copyright (C) Parity Technologies (UK) Ltd. +// SPDX-License-Identifier: Apache-2.0 + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Contains the [`DecodeFinished`] type, sequestered into its own module +//! to prevent its direct construction in the whole crate. + +use core::marker::PhantomData; + +/// A zero-sized type signifying that the decoding finished. +/// +/// To be used in [`Decode::decode_into`] to allow the implementation to explicitly +/// assert that the `MaybeUninit` passed into that function was properly initialized. +pub struct DecodeFinished(PhantomData<*const ()>); + +impl DecodeFinished { + /// Assert that the decoding has finished. + /// + /// Should be used in [`Decode::decode_into`] to signify that + /// the `MaybeUninit` passed into that function was properly initialized. + #[inline] + pub unsafe fn assert_decoding_finished() -> DecodeFinished { + DecodeFinished(PhantomData) + } +} diff --git a/src/lib.rs b/src/lib.rs index ea999180..133c736d 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -12,229 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -//! # Parity SCALE Codec -//! -//! 
Rust implementation of the SCALE (Simple Concatenated Aggregate Little-Endian) data format -//! for types used in the Parity Substrate framework. -//! -//! SCALE is a light-weight format which allows encoding (and decoding) which makes it highly -//! suitable for resource-constrained execution environments like blockchain runtimes and low-power, -//! low-memory devices. -//! -//! It is important to note that the encoding context (knowledge of how the types and data -//! structures look) needs to be known separately at both encoding and decoding ends. -//! The encoded data does not include this contextual information. -//! -//! To get a better understanding of how the encoding is done for different types, -//! take a look at the -//! [SCALE Code page at the Substrate Knowledge Base](https://docs.substrate.io/v3/advanced/scale-codec/). -//! -//! ## Implementation -//! -//! The codec is implemented using the following traits: -//! -//! ### Encode -//! -//! The `Encode` trait is used for encoding of data into the SCALE format. The `Encode` trait -//! contains the following functions: - -//! -//! * `size_hint(&self) -> usize`: Gets the capacity (in bytes) required for the encoded data. -//! This is to avoid double-allocation of memory needed for the encoding. -//! It can be an estimate and does not need to be an exact number. -//! If the size is not known, even no good maximum, then we can skip this function from the trait -//! implementation. This is required to be a cheap operation, so should not involve iterations etc. -//! * `encode_to(&self, dest: &mut T)`: Encodes the value and appends it to a destination -//! buffer. -//! * `encode(&self) -> Vec`: Encodes the type data and returns a slice. -//! * `using_encoded R>(&self, f: F) -> R`: Encodes the type data and -//! executes a closure on the encoded value. -//! Returns the result from the executed closure. -//! -//! **Note:** Implementations should override `using_encoded` for value types and `encode_to` for -//! 
allocating types. `size_hint` should be implemented for all types, wherever possible. Wrapper -//! types should override all methods. -//! -//! ### Decode -//! -//! The `Decode` trait is used for deserialization/decoding of encoded data into the respective -//! types. -//! -//! * `fn decode(value: &mut I) -> Result`: Tries to decode the value from -//! SCALE format to the type it is called on. -//! Returns an `Err` if the decoding fails. -//! -//! ### CompactAs -//! -//! The `CompactAs` trait is used for wrapping custom types/structs as compact types, which makes -//! them even more space/memory efficient. The compact encoding is described [here](https://docs.substrate.io/v3/advanced/scale-codec/#compactgeneral-integers). -//! -//! * `encode_as(&self) -> &Self::As`: Encodes the type (self) as a compact type. -//! The type `As` is defined in the same trait and its implementation should be compact encode-able. -//! * `decode_from(_: Self::As) -> Result`: Decodes the type (self) from a compact -//! encode-able type. -//! -//! ### HasCompact -//! -//! The `HasCompact` trait, if implemented, tells that the corresponding type is a compact -//! encode-able type. -//! -//! ### EncodeLike -//! -//! The `EncodeLike` trait needs to be implemented for each type manually. When using derive, it is -//! done automatically for you. Basically the trait gives you the opportunity to accept multiple -//! types to a function that all encode to the same representation. -//! -//! ## Usage Examples -//! -//! Following are some examples to demonstrate usage of the codec. -//! -//! ### Simple types -//! -#![cfg_attr(feature = "derive", doc = "```rust")] -#![cfg_attr(not(feature = "derive"), doc = "*(Only compiled with feature `derive`)*\n```ignore")] -//! use parity_scale_codec::{Encode, Decode}; -//! -//! #[derive(Debug, PartialEq, Encode, Decode)] -//! enum EnumType { -//! #[codec(index = 15)] -//! A, -//! B(u32, u64), -//! C { -//! a: u32, -//! b: u64, -//! }, -//! } -//! -//! 
let a = EnumType::A; -//! let b = EnumType::B(1, 2); -//! let c = EnumType::C { a: 1, b: 2 }; -//! -//! a.using_encoded(|ref slice| { -//! assert_eq!(slice, &b"\x0f"); -//! }); -//! -//! b.using_encoded(|ref slice| { -//! assert_eq!(slice, &b"\x01\x01\0\0\0\x02\0\0\0\0\0\0\0"); -//! }); -//! -//! c.using_encoded(|ref slice| { -//! assert_eq!(slice, &b"\x02\x01\0\0\0\x02\0\0\0\0\0\0\0"); -//! }); -//! -//! let mut da: &[u8] = b"\x0f"; -//! assert_eq!(EnumType::decode(&mut da).ok(), Some(a)); -//! -//! let mut db: &[u8] = b"\x01\x01\0\0\0\x02\0\0\0\0\0\0\0"; -//! assert_eq!(EnumType::decode(&mut db).ok(), Some(b)); -//! -//! let mut dc: &[u8] = b"\x02\x01\0\0\0\x02\0\0\0\0\0\0\0"; -//! assert_eq!(EnumType::decode(&mut dc).ok(), Some(c)); -//! -//! let mut dz: &[u8] = &[0]; -//! assert_eq!(EnumType::decode(&mut dz).ok(), None); -//! -//! # fn main() { } -//! ``` -//! -//! ### Compact type with HasCompact -//! -#![cfg_attr(feature = "derive", doc = "```rust")] -#![cfg_attr(not(feature = "derive"), doc = "*(Only compiled with feature `derive`)*\n```ignore")] -//! use parity_scale_codec::{Encode, Decode, Compact, HasCompact}; -//! -//! #[derive(Debug, PartialEq, Encode, Decode)] -//! struct Test1CompactHasCompact { -//! #[codec(compact)] -//! bar: T, -//! } -//! -//! #[derive(Debug, PartialEq, Encode, Decode)] -//! struct Test1HasCompact { -//! #[codec(encoded_as = "::Type")] -//! bar: T, -//! } -//! -//! let test_val: (u64, usize) = (0u64, 1usize); -//! -//! let encoded = Test1HasCompact { bar: test_val.0 }.encode(); -//! assert_eq!(encoded.len(), test_val.1); -//! assert_eq!(>::decode(&mut &encoded[..]).unwrap().bar, test_val.0); -//! -//! # fn main() { } -//! ``` -//! ### Type with CompactAs -//! -#![cfg_attr(feature = "derive", doc = "```rust")] -#![cfg_attr(not(feature = "derive"), doc = "*(Only compiled with feature `derive`)*\n```ignore")] -//! use serde_derive::{Serialize, Deserialize}; -//! 
use parity_scale_codec::{Encode, Decode, Compact, HasCompact, CompactAs, Error}; -//! -//! #[cfg_attr(feature = "std", derive(Serialize, Deserialize, Debug))] -//! #[derive(PartialEq, Eq, Clone)] -//! struct StructHasCompact(u32); -//! -//! impl CompactAs for StructHasCompact { -//! type As = u32; -//! -//! fn encode_as(&self) -> &Self::As { -//! &12 -//! } -//! -//! fn decode_from(_: Self::As) -> Result { -//! Ok(StructHasCompact(12)) -//! } -//! } -//! -//! impl From> for StructHasCompact { -//! fn from(_: Compact) -> Self { -//! StructHasCompact(12) -//! } -//! } -//! -//! #[derive(Debug, PartialEq, Encode, Decode)] -//! enum TestGenericHasCompact { -//! A { -//! #[codec(compact)] a: T -//! }, -//! } -//! -//! let a = TestGenericHasCompact::A:: { -//! a: StructHasCompact(12325678), -//! }; -//! -//! let encoded = a.encode(); -//! assert_eq!(encoded.len(), 2); -//! -//! # fn main() { } -//! ``` -//! -//! ## Derive attributes -//! -//! The derive implementation supports the following attributes: -//! - `codec(dumb_trait_bound)`: This attribute needs to be placed above the type that one of the -//! trait should be implemented for. It will make the algorithm that determines the to-add trait -//! bounds fall back to just use the type parameters of the type. This can be useful for situation -//! where the algorithm includes private types in the public interface. By using this attribute, -//! you should not get this error/warning again. -//! - `codec(skip)`: Needs to be placed above a field or variant and makes it to be skipped while -//! encoding/decoding. -//! - `codec(compact)`: Needs to be placed above a field and makes the field use compact encoding. -//! (The type needs to support compact encoding.) -//! - `codec(encoded_as = "OtherType")`: Needs to be placed above a field and makes the field being -//! encoded by using `OtherType`. -//! - `codec(index = 0)`: Needs to be placed above an enum variant to make the variant use the given -//! index when encoded. 
By default the index is determined by counting from `0` beginning wth the -//! first variant. -//! - `codec(encode_bound)`, `codec(decode_bound)` and `codec(mel_bound)`: All 3 attributes take -//! in a `where` clause for the `Encode`, `Decode` and `MaxEncodedLen` trait implementation for -//! the annotated type respectively. -//! - `codec(encode_bound(skip_type_params))`, `codec(decode_bound(skip_type_params))` and -//! `codec(mel_bound(skip_type_params))`: All 3 sub-attributes take in types as arguments to skip -//! trait derivation of the corresponding trait, e.g. T in -//! `codec(encode_bound(skip_type_params(T)))` will not contain a `Encode` trait bound while -//! `Encode` is being derived for the annotated type. - +#![doc = include_str!("../README.md")] #![warn(missing_docs)] #![cfg_attr(not(feature = "std"), no_std)] @@ -243,7 +21,7 @@ #[doc(hidden)] pub extern crate alloc; -#[cfg(feature = "parity-scale-codec-derive")] +#[cfg(feature = "derive")] #[allow(unused_imports)] #[macro_use] extern crate parity_scale_codec_derive; @@ -252,7 +30,7 @@ extern crate parity_scale_codec_derive; #[macro_use] extern crate serde_derive; -#[cfg(feature = "parity-scale-codec-derive")] +#[cfg(feature = "derive")] pub use parity_scale_codec_derive::*; #[cfg(feature = "std")] @@ -265,6 +43,7 @@ pub mod alloc { pub use std::collections; pub use std::sync; pub use std::rc; + pub use std::alloc; } mod codec; @@ -276,6 +55,7 @@ mod bit_vec; #[cfg(feature = "generic-array")] mod generic_array; mod decode_all; +mod decode_finished; mod depth_limit; mod encode_append; mod encode_like; @@ -288,7 +68,7 @@ mod const_encoded_len; pub use self::error::Error; pub use self::codec::{ Input, Output, Decode, Encode, Codec, EncodeAsRef, WrapperTypeEncode, WrapperTypeDecode, - OptionBool, DecodeLength, FullCodec, FullEncode, decode_vec_with_len + OptionBool, DecodeLength, FullCodec, FullEncode, decode_vec_with_len, }; #[cfg(feature = "std")] pub use self::codec::IoReader; @@ -296,6 +76,7 @@ pub use 
self::compact::{Compact, HasCompact, CompactAs, CompactLen, CompactRef}; pub use self::joiner::Joiner; pub use self::keyedvec::KeyedVec; pub use self::decode_all::DecodeAll; +pub use self::decode_finished::DecodeFinished; pub use self::depth_limit::DecodeLimit; pub use self::encode_append::EncodeAppend; pub use self::encode_like::{EncodeLike, Ref}; diff --git a/tests/mod.rs b/tests/mod.rs index ac6e3fd3..166e6750 100644 --- a/tests/mod.rs +++ b/tests/mod.rs @@ -643,3 +643,168 @@ fn no_warning_for_deprecated() { VariantB, } } + +#[test] +fn decoding_a_huge_array_inside_of_box_does_not_overflow_the_stack() { + let data = &[]; + let _ = Box::<[u8; 100 * 1024 * 1024]>::decode(&mut data.as_slice()); +} + +#[test] +fn decoding_a_huge_array_inside_of_rc_does_not_overflow_the_stack() { + let data = &[]; + let _ = std::rc::Rc::<[u8; 100 * 1024 * 1024]>::decode(&mut data.as_slice()); +} + +#[test] +fn decoding_a_huge_array_inside_of_arc_does_not_overflow_the_stack() { + let data = &[]; + let _ = std::sync::Arc::<[u8; 100 * 1024 * 1024]>::decode(&mut data.as_slice()); +} + +#[test] +fn decoding_an_array_of_boxed_zero_sized_types_works() { + #[cfg(not(miri))] + const SIZE: usize = 100 * 1024 * 1024; + + #[cfg(miri)] + const SIZE: usize = 1024; + + let data = &[]; + assert!(Box::<[(); SIZE]>::decode(&mut data.as_slice()).is_ok()); +} + +#[test] +fn incomplete_decoding_of_an_array_drops_partially_read_elements_if_reading_fails() { + thread_local! { + pub static COUNTER: core::cell::Cell = core::cell::Cell::new(0); + } + + #[derive(DeriveDecode)] + struct Foobar(u8); + + impl Drop for Foobar { + fn drop(&mut self) { + COUNTER.with(|counter| { + counter.set(counter.get() + 1); + }); + } + } + + let data = &[1, 2, 3]; + assert!(<[Foobar; 4]>::decode(&mut data.as_slice()).is_err()); + + COUNTER.with(|counter| { + assert_eq!(counter.get(), 3); + }); +} + +#[test] +fn incomplete_decoding_of_an_array_drops_partially_read_elements_if_reading_panics() { + thread_local! 
{ + pub static COUNTER: core::cell::Cell = core::cell::Cell::new(0); + } + + struct Foobar(u8); + + impl Decode for Foobar { + fn decode(input: &mut I) -> Result { + let mut buffer = [0; 1]; + input.read(&mut buffer).unwrap(); + Ok(Self(buffer[0])) + } + } + + impl Drop for Foobar { + fn drop(&mut self) { + COUNTER.with(|counter| { + counter.set(counter.get() + 1); + }); + } + } + + let data = &[1, 2, 3]; + let result = std::panic::catch_unwind(|| { + let _ = <[Foobar; 4]>::decode(&mut data.as_slice()); + }); + + assert!(result.is_err()); + + COUNTER.with(|counter| { + assert_eq!(counter.get(), 3); + }); +} + +#[test] +fn deserializing_of_big_recursively_nested_enum_works() { + #[derive(PartialEq, Eq, DeriveDecode, DeriveEncode)] + struct Data([u8; 1472]); + + #[derive(PartialEq, Eq, DeriveDecode, DeriveEncode)] + enum Enum { + Nested(Vec), + Data(Data), + Variant1, + Variant2, + Variant3, + Variant4, + Variant5, + Variant6, + Variant7, + Variant8, + Variant9, + Variant10, + Variant11, + Variant12, + Variant13, + Variant14, + Variant15, + Variant16, + Variant17, + Variant18, + Variant19, + Variant20, + Variant21, + Variant22, + Variant23, + Variant24, + Variant25, + Variant26, + Variant27, + Variant28, + Variant29, + Variant30, + Variant31, + Variant32, + Variant33, + Variant34, + Variant35, + Variant36, + Variant37, + Variant38, + Variant39, + Variant40, + Variant41, + } + + fn gen_dummy_data(depth_remaining: usize) -> Enum { + let mut vec = vec![Enum::Data(Data([0; 1472]))]; + if depth_remaining > 0 { + vec.push(gen_dummy_data(depth_remaining - 1)); + } + Enum::Nested(vec) + } + + let obj = gen_dummy_data(32); + let data = obj.encode(); + + // This should not overflow the stack. + let obj_d = Enum::decode(&mut &data[..]).unwrap(); + + // NOTE: Not using `assert_eq` since we don't want to print out such a big object if this fails. 
+ assert!(obj == obj_d); + + use parity_scale_codec::DecodeLimit; + let obj_d2 = Enum::decode_with_depth_limit(40, &mut &data[..]).unwrap(); + assert!(obj == obj_d2); +}