Skip to content

Commit

Permalink
perf(plugin): optimize aws s3 file listing
Browse files Browse the repository at this point in the history
Lower the number of requests sent to S3 while listing objects. For buckets
with a large number of objects, listing can take some time, because a
metadata request is sent to S3 for each object in the bucket.
This is redundant, because all the data needed for object listing is
available in S3ObjectSummary. This reduces listing time significantly.

Resolves: #490
  • Loading branch information
bjarosze authored and fhussonnois committed Jul 5, 2023
1 parent d8e8df8 commit ce329c0
Show file tree
Hide file tree
Showing 2 changed files with 28 additions and 7 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -95,11 +95,6 @@ public Collection<FileObjectMeta> listObjects() {

objectMetaList.addAll(objectListing.getObjectSummaries()
.stream()
.map(s3ObjectSummary ->
new S3BucketKey(
s3ObjectSummary.getBucketName(),
s3ObjectSummary.getKey()
))
.map(s3Storage::getObjectMetadata)
.filter(Objects::nonNull)
.collect(Collectors.toList()));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
import com.amazonaws.services.s3.model.GetObjectMetadataRequest;
import com.amazonaws.services.s3.model.GetObjectRequest;
import com.amazonaws.services.s3.model.ObjectMetadata;
import com.amazonaws.services.s3.model.S3ObjectSummary;
import com.amazonaws.services.s3.model.StorageClass;
import io.streamthoughts.kafka.connect.filepulse.source.FileObjectMeta;
import io.streamthoughts.kafka.connect.filepulse.source.GenericFileObjectMeta;
Expand Down Expand Up @@ -186,6 +187,13 @@ public FileObjectMeta getObjectMetadata(final S3BucketKey s3Object) {
);
}

/**
 * Builds the {@link FileObjectMeta} describing an object from its listing summary.
 *
 * <p>The bucket name and key are read from the summary to form the {@link S3BucketKey},
 * and the summary itself is then used as the source of the metadata fields.
 *
 * @param s3ObjectSummary the summary of the object, as returned by an object listing.
 * @return the metadata derived from the given summary.
 */
public FileObjectMeta getObjectMetadata(final S3ObjectSummary s3ObjectSummary) {
    final S3BucketKey bucketKey = new S3BucketKey(
        s3ObjectSummary.getBucketName(),
        s3ObjectSummary.getKey()
    );
    return createFileObjectMeta(bucketKey, s3ObjectSummary);
}

private ObjectMetadata loadObjectMetadata(final S3BucketKey s3Object) {
var request = new GetObjectMetadataRequest(s3Object.bucketName(), s3Object.key());
try {
Expand Down Expand Up @@ -264,11 +272,11 @@ private static FileObjectMeta createFileObjectMeta(final S3BucketKey s3Object,
userDefinedMetadata.put("s3.object.summary.etag", objectMetadata.getETag());
userDefinedMetadata.put("s3.object.summary.storageClass", objectMetadata.getStorageClass());

final String contentMD5 = objectMetadata.getContentMD5();
final String contentMD5 = objectMetadata.getETag();

FileObjectMeta.ContentDigest digest = null;
if (contentMD5 != null) {
digest = new FileObjectMeta.ContentDigest(contentMD5, "MD5");
digest = new FileObjectMeta.ContentDigest(contentMD5, "ETAG");
}

return new GenericFileObjectMeta.Builder()
Expand All @@ -280,4 +288,22 @@ private static FileObjectMeta createFileObjectMeta(final S3BucketKey s3Object,
.withUserDefinedMetadata(userDefinedMetadata)
.build();
}

/**
 * Creates a {@link FileObjectMeta} from the fields carried by an S3 object summary.
 *
 * <p>The URI comes from the bucket key; the name, content length, last-modified value
 * and digest come from the summary. The digest wraps the summary's ETag and is tagged
 * with the algorithm name {@code "ETAG"}.
 *
 * @param s3Object        the bucket/key pair used to build the object URI.
 * @param s3ObjectSummary the summary supplying key, size, last-modified and ETag.
 * @return the resulting metadata; its content digest is {@code null} when the summary
 *         has no ETag.
 */
private static FileObjectMeta createFileObjectMeta(final S3BucketKey s3Object,
                                                   final S3ObjectSummary s3ObjectSummary) {
    final String etag = s3ObjectSummary.getETag();

    // No ETag on the summary means no digest can be attached.
    final FileObjectMeta.ContentDigest etagDigest = (etag == null)
        ? null
        : new FileObjectMeta.ContentDigest(etag, "ETAG");

    return new GenericFileObjectMeta.Builder()
        .withUri(s3Object.toURI())
        .withName(s3ObjectSummary.getKey())
        .withContentLength(s3ObjectSummary.getSize())
        .withLastModified(s3ObjectSummary.getLastModified())
        .withContentDigest(etagDigest)
        .build();
}
}

0 comments on commit ce329c0

Please sign in to comment.