-
Notifications
You must be signed in to change notification settings - Fork 615
HDDS-11063. TestSnapshotDiffManager#testThreadPoolIsFull is flaky without wait between batches #10581
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
HDDS-11063. TestSnapshotDiffManager#testThreadPoolIsFull is flaky without wait between batches #10581
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -95,11 +95,8 @@ | |
| import java.util.Optional; | ||
| import java.util.Set; | ||
| import java.util.UUID; | ||
| import java.util.concurrent.CountDownLatch; | ||
| import java.util.concurrent.ExecutionException; | ||
| import java.util.concurrent.ExecutorService; | ||
| import java.util.concurrent.Future; | ||
| import java.util.concurrent.SynchronousQueue; | ||
| import java.util.concurrent.ThreadPoolExecutor; | ||
| import java.util.concurrent.TimeUnit; | ||
| import java.util.concurrent.atomic.AtomicInteger; | ||
| import java.util.function.BiFunction; | ||
|
|
@@ -1170,105 +1167,110 @@ private void uploadSnapshotDiffJobToDb(SnapshotInfo fromSnapshot, | |
|
|
||
| private static Stream<Arguments> threadPoolFullScenarios() { | ||
| return Stream.of( | ||
| Arguments.of("When there is a wait time between job batches", | ||
| 500L, 45, 0), | ||
| Arguments.of("When there is no wait time between job batches", | ||
| 0L, 20, 25) | ||
| Arguments.of("When the pool drains between job batches", | ||
| true, 45, 0), | ||
| // 10 running + 10 queued = 20 accepted, remaining 25 rejected | ||
| Arguments.of("When the pool does not drain between job batches", | ||
| false, 20, 25) | ||
| ); | ||
| } | ||
|
|
||
| @ParameterizedTest(name = "{0}") | ||
| @MethodSource("threadPoolFullScenarios") | ||
| public void testThreadPoolIsFull(String description, | ||
| long waitBetweenBatches, | ||
| boolean drainBetweenBatches, | ||
| int expectInProgressJobsCount, | ||
| int expectRejectedJobsCount) | ||
| throws Exception { | ||
| ExecutorService executorService = new ThreadPoolExecutor(100, 100, 0, | ||
| TimeUnit.MILLISECONDS, new SynchronousQueue<>() | ||
| ); | ||
|
|
||
| List<SnapshotInfo> snapshotInfos = new ArrayList<>(); | ||
|
|
||
| for (int i = 0; i < 10; i++) { | ||
| UUID snapshotId = UUID.randomUUID(); | ||
| String snapshotName = "snap-" + snapshotId; | ||
| SnapshotInfo snapInfo = new SnapshotInfo.Builder() | ||
| .setSnapshotId(snapshotId) | ||
| .setVolumeName(VOLUME_NAME) | ||
| .setBucketName(BUCKET_NAME) | ||
| .setName(snapshotName) | ||
| .setSnapshotPath("fromSnapshotPath") | ||
| .build(); | ||
| snapshotInfos.add(snapInfo); | ||
|
|
||
| when(snapshotInfoTable.get(getTableKey(VOLUME_NAME, BUCKET_NAME, | ||
| snapshotName))).thenReturn(snapInfo); | ||
| } | ||
|
|
||
| List<SnapshotInfo> snapshotInfos = createTestSnapshots(10); | ||
| SnapshotDiffManager spy = spy(snapshotDiffManager); | ||
|
|
||
| for (int i = 0; i < snapshotInfos.size(); i++) { | ||
| for (int j = i + 1; j < snapshotInfos.size(); j++) { | ||
| String fromSnapshotName = snapshotInfos.get(i).getName(); | ||
| String toSnapshotName = snapshotInfos.get(j).getName(); | ||
| CountDownLatch blockWorkers = new CountDownLatch(1); | ||
| AtomicInteger completedJobs = new AtomicInteger(0); | ||
| doAnswer(invocation -> { | ||
| blockWorkers.await(); | ||
| completedJobs.incrementAndGet(); | ||
| return null; | ||
| }).when(spy).generateSnapshotDiffReport(anyString(), anyString(), | ||
| eq(VOLUME_NAME), eq(BUCKET_NAME), anyString(), anyString(), | ||
| eq(false), eq(false)); | ||
|
|
||
| doAnswer(invocation -> { | ||
| Thread.sleep(250L); | ||
| return null; | ||
| }).when(spy).generateSnapshotDiffReport(anyString(), anyString(), | ||
| eq(VOLUME_NAME), eq(BUCKET_NAME), eq(fromSnapshotName), | ||
| eq(toSnapshotName), eq(false), eq(false)); | ||
| } | ||
| if (drainBetweenBatches) { | ||
| blockWorkers.countDown(); | ||
| } | ||
|
|
||
| List<Future<SnapshotDiffResponse>> futures = new ArrayList<>(); | ||
| for (int i = 0; i < snapshotInfos.size(); i++) { | ||
| for (int j = i + 1; j < snapshotInfos.size(); j++) { | ||
| String fromSnapshotName = snapshotInfos.get(i).getName(); | ||
| String toSnapshotName = snapshotInfos.get(j).getName(); | ||
|
|
||
| Future<SnapshotDiffResponse> future = executorService.submit( | ||
| () -> submitJob(spy, fromSnapshotName, toSnapshotName)); | ||
| futures.add(future); | ||
| try { | ||
| List<SnapshotDiffResponse> responses = new ArrayList<>(); | ||
| int totalSubmitted = 0; | ||
| for (int i = 0; i < snapshotInfos.size(); i++) { | ||
| for (int j = i + 1; j < snapshotInfos.size(); j++) { | ||
| String fromSnapshotName = snapshotInfos.get(i).getName(); | ||
| String toSnapshotName = snapshotInfos.get(j).getName(); | ||
| responses.add(submitJob(spy, fromSnapshotName, toSnapshotName)); | ||
| totalSubmitted++; | ||
| } | ||
| if (drainBetweenBatches) { | ||
| final int expected = totalSubmitted; | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Not in scope for the current PR to fix flakiness but may be considered as a follow up test improvement:
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Great idea, I opened HDDS-15648 as a follow-up. |
||
| attempt(() -> { | ||
| if (completedJobs.get() < expected) { | ||
| throw new IllegalStateException("Waiting for jobs to complete"); | ||
| } | ||
| return null; | ||
| }, 50, TimeDuration.valueOf(100, TimeUnit.MILLISECONDS), null, null); | ||
| } | ||
| } | ||
| Thread.sleep(waitBetweenBatches); | ||
| } | ||
|
|
||
| // Wait to make sure that all jobs finish before assertion. | ||
| Thread.sleep(1000L); | ||
| int inProgressJobsCount = 0; | ||
| int rejectedJobsCount = 0; | ||
|
|
||
| for (Future<SnapshotDiffResponse> future : futures) { | ||
| SnapshotDiffResponse response = future.get(); | ||
| if (response.getJobStatus() == IN_PROGRESS) { | ||
| inProgressJobsCount++; | ||
| } else if (response.getJobStatus() == REJECTED) { | ||
| rejectedJobsCount++; | ||
| } else { | ||
| throw new IllegalStateException("Unexpected job status."); | ||
| int inProgressJobsCount = 0; | ||
| int rejectedJobsCount = 0; | ||
| for (SnapshotDiffResponse response : responses) { | ||
| if (response.getJobStatus() == IN_PROGRESS) { | ||
| inProgressJobsCount++; | ||
| } else if (response.getJobStatus() == REJECTED) { | ||
| rejectedJobsCount++; | ||
| } else { | ||
| throw new IllegalStateException("Unexpected job status."); | ||
| } | ||
| } | ||
| } | ||
|
|
||
| assertEquals(expectInProgressJobsCount, inProgressJobsCount); | ||
| assertEquals(expectRejectedJobsCount, rejectedJobsCount); | ||
|
|
||
| int notFoundJobs = 0; | ||
| for (int i = 0; i < snapshotInfos.size(); i++) { | ||
| for (int j = i + 1; j < snapshotInfos.size(); j++) { | ||
| SnapshotDiffJob diffJob = | ||
| getSnapshotDiffJobFromDb(snapshotInfos.get(i), | ||
| snapshotInfos.get(j)); | ||
| if (diffJob == null) { | ||
| notFoundJobs++; | ||
| assertEquals(expectInProgressJobsCount, inProgressJobsCount); | ||
| assertEquals(expectRejectedJobsCount, rejectedJobsCount); | ||
|
|
||
| int notFoundJobs = 0; | ||
| for (int i = 0; i < snapshotInfos.size(); i++) { | ||
| for (int j = i + 1; j < snapshotInfos.size(); j++) { | ||
| SnapshotDiffJob diffJob = | ||
| getSnapshotDiffJobFromDb(snapshotInfos.get(i), | ||
| snapshotInfos.get(j)); | ||
| if (diffJob == null) { | ||
| notFoundJobs++; | ||
| } | ||
| } | ||
| } | ||
|
|
||
| // assert that rejected jobs were removed from the job table as well. | ||
| assertEquals(expectRejectedJobsCount, notFoundJobs); | ||
| } finally { | ||
| blockWorkers.countDown(); | ||
| } | ||
| } | ||
|
|
||
| // assert that rejected jobs were removed from the job table as well. | ||
| assertEquals(expectRejectedJobsCount, notFoundJobs); | ||
| executorService.shutdown(); | ||
| private List<SnapshotInfo> createTestSnapshots(int count) throws IOException { | ||
| List<SnapshotInfo> snapshotInfos = new ArrayList<>(); | ||
| for (int i = 0; i < count; i++) { | ||
| UUID snapshotId = UUID.randomUUID(); | ||
| String snapshotName = "snap-" + snapshotId; | ||
| SnapshotInfo snapInfo = new SnapshotInfo.Builder() | ||
| .setSnapshotId(snapshotId) | ||
| .setVolumeName(VOLUME_NAME) | ||
| .setBucketName(BUCKET_NAME) | ||
| .setName(snapshotName) | ||
| .setSnapshotPath("fromSnapshotPath") | ||
| .build(); | ||
| snapshotInfos.add(snapInfo); | ||
| when(snapshotInfoTable.get(getTableKey(VOLUME_NAME, BUCKET_NAME, snapshotName))) | ||
| .thenReturn(snapInfo); | ||
| } | ||
| return snapshotInfos; | ||
| } | ||
|
|
||
| private SnapshotDiffResponse submitJob(SnapshotDiffManager diffManager, | ||
|
|
@@ -1641,7 +1643,7 @@ public void testGetSnapshotDiffReportWhenDone() throws Exception { | |
|
|
||
| SnapshotDiffManager spy = spy(snapshotDiffManager); | ||
| SnapshotDiffReportOzone dummyReport = new SnapshotDiffReportOzone( | ||
| SnapshotDiffManager.getSnapshotRootPath(ctx.volumeName, ctx.bucketName).toString(), | ||
| spy.getSnapshotRootPath(ctx.volumeName, ctx.bucketName).toString(), | ||
| ctx.volumeName, ctx.bucketName, ctx.fromSnapshotName, ctx.toSnapshotName, | ||
| expectedEntries, null); | ||
| doReturn(dummyReport).when(spy).createPageResponse(any(SnapshotDiffJob.class), | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks for the patch @rhalm.
nit: Use
full_thread_pool_size = 2 * OZONE_OM_SNAPSHOT_DIFF_THREAD_POOL_SIZEand45 - full_thread_pool_sizefor the expected accepted and rejected jobs here.Otherwise, LGTM.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks for the review @SaketaChalamchala, done.