diff --git a/bin/hbase b/bin/hbase index 045b6171fa67..ca6c7cf43f02 100755 --- a/bin/hbase +++ b/bin/hbase @@ -84,6 +84,7 @@ show_usage() { echo " wal Write-ahead-log analyzer" echo " hfile Store file analyzer" echo " sft Store file tracker viewer" + echo " sftrecover Offline store file tracker (FILE) manifest recover tool" echo " zkcli Run the ZooKeeper shell" echo " master Run an HBase HMaster node" echo " regionserver Run an HBase HRegionServer node" @@ -608,6 +609,8 @@ elif [ "$COMMAND" = "hfile" ] ; then CLASS='org.apache.hadoop.hbase.io.hfile.HFilePrettyPrinter' elif [ "$COMMAND" = "sft" ] ; then CLASS='org.apache.hadoop.hbase.regionserver.storefiletracker.StoreFileListFilePrettyPrinter' +elif [ "$COMMAND" = "sftrecover" ] ; then + CLASS='org.apache.hadoop.hbase.regionserver.storefiletracker.StoreFileListRecoverTool' elif [ "$COMMAND" = "zkcli" ] ; then CLASS="org.apache.hadoop.hbase.zookeeper.ZKMainServer" for f in $HBASE_HOME/lib/zkcli/*.jar; do diff --git a/bin/hbase.cmd b/bin/hbase.cmd index f8111a3bc0a9..d86d14291fa6 100644 --- a/bin/hbase.cmd +++ b/bin/hbase.cmd @@ -439,6 +439,10 @@ goto :eof set CLASS=org.apache.hadoop.hbase.regionserver.storefiletracker.StoreFileListFilePrettyPrinter goto :eof +:sftrecover + set CLASS=org.apache.hadoop.hbase.regionserver.storefiletracker.StoreFileListRecoverTool + goto :eof + :zkcli set CLASS=org.apache.hadoop.hbase.zookeeper.ZKMainServer set CLASSPATH=!CLASSPATH!;%HBASE_HOME%\lib\zkcli\* @@ -473,6 +477,7 @@ goto :eof echo wal Write-ahead-log analyzer echo hfile Store file analyzer echo sft Store file tracker viewer + echo sftrecover Offline store file tracker (FILE) manifest recover tool echo zkcli Run the ZooKeeper shell echo master Run an HBase HMaster node echo regionserver Run an HBase HRegionServer node diff --git a/dev-support/design-docs/fsft-manifest-recover.md b/dev-support/design-docs/fsft-manifest-recover.md new file mode 100644 index 000000000000..b80b321820e8 --- /dev/null +++ 
b/dev-support/design-docs/fsft-manifest-recover.md @@ -0,0 +1,268 @@ +# FSFT Manifest Recover Design + +## Problem + +The FILE store file tracker (FSFT) persists store membership in manifest files under `.filelist`. +If the newest manifest is corrupted in a non-EOF way, `StoreFileListFile.load(...)` fails hard and +region/store open can fail as well. + +For FILE SFT, the manifest can in principle reference store members that do not exist as plain +files in the child family directory: + +- plain HFiles do exist on disk +- virtual split/merge `Reference`s may exist only in the manifest +- virtual `HFileLink`s may exist only in the manifest plus archive back references + +This design adds a single, offline, operator-driven recovery tool that rebuilds a corrupted +manifest **purely from the store directory listing**, plus a non-authoritative data-loss assessment +derived from `hbase:meta` split/merge lineage. + +## Design at a glance + +- **One surface: an offline CLI** (`hbase sftrecover`). There is no online/in-master recovery path. +- **One reconstruction strategy: disk-only.** The recovered manifest is exactly the set of store + files physically present under the family directory (HFiles, references, and links that exist on + disk), filtered by the same rules the `DefaultStoreFileTracker` uses. The tool never synthesizes + references or `HFileLink`s from split/merge lineage and never injects parent-derived entries. +- **A separate, read-only data-loss assessment.** For user-table regions the tool consults + `hbase:meta` for split/merge parents and reports whether bringing the region online risks data + loss, but this assessment never changes the manifest that is written. + +All the logic lives in `StoreFileListRecover`; `StoreFileListRecoverTool` is only the CLI surface +(argument parsing, safety acknowledgements, and report formatting). 
+ +## Why offline-only + +An earlier draft included an online HBCK2-style chained procedure that closed the region, rebuilt +the manifest, and re-opened it. We dropped it: + +- **Nothing in the master can truly fence a RegionServer away from the store directory** while a + manifest is rewritten. The only real quiescence guarantee is that the region is not hosted + anywhere — which is an operator fact, not something a master RPC can assert. The CLI makes the + operator acknowledge this explicitly via `--region-offline`. +- **`master:store` is structurally impossible to recover online** — the procedure store *is* + `master:store`. If its `.filelist` is corrupt, the master JVM aborts during init before + `ProcedureExecutor` comes up. There is nothing to submit a procedure to. An offline tool is the + only mechanism that works for this case. +- A single offline tool that handles all three target shapes (user table, `hbase:meta`, + `master:store`) is far simpler to reason about and to test than a procedure plus an RPC plus a CLI + that share reconstruction code but diverge on orchestration. + +## Targets + +The tool can target three structurally different regions. + +### User-table region + +Standard tables. May split, may merge, may be a snapshot/clone source. `.filelist` can contain +plain HFiles, split-reference files, merge-reference files, and `HFileLink`s. The recovered manifest +is the on-disk file set. Split/merge parents from `hbase:meta` are assessed for data-loss reporting. + +### `hbase:meta` + +Meta has 1 region by design and **never splits or merges**. Enforced at runtime in +`RegionSplitPolicy.shouldSplit(...)`: + +```java +return !region.getRegionInfo().isMetaRegion() && region.isAvailable() ... +``` + +Meta is also never a snapshot source, so its `.filelist` only ever contains plain HFiles produced +by flushes. There is no catalog lineage to assess, so the tool skips the parent meta-walk for meta. 
+The tool refuses to touch meta unless `--force-meta` is supplied, because recovering meta is only +valid with the master offline. + +### `master:store` (master local region) + +Used to persist the master local store (procedure store, region-state store, RS tracker, server +state). Defined in `MasterRegionFactory`: + +```java +public static final TableName TABLE_NAME = TableName.valueOf("master:store"); +``` + +`MasterRegion.bootstrap(...)` creates a single hard-coded `RegionInfo`. This region never goes +through `SplitTableRegionProcedure` or `MergeTableRegionsProcedure`, is never assigned via +`AssignmentManager`, is never a snapshot source, and lives entirely inside the master JVM. Its CF +directories only ever contain plain HFiles, so there is no catalog lineage to assess — the tool +skips the parent meta-walk (it uses `MasterRegionFactory.TABLE_NAME` to detect this case). + +FILE SFT *is* a supported configuration for `master:store` (the master-store-specific +`hbase.master.store.region.file-tracker.impl` key takes precedence over +`hbase.store.file-tracker.impl`, then `DEFAULT`; `MIGRATION` is rejected, `FILE` is allowed), so +`master:store` corruption from FILE SFT is a real, in-tree-supported failure mode and warrants a +recovery story. This is the case the offline tool is structurally required for: corruption of its +`.filelist` prevents `ProcedureExecutor` from initializing, so no procedure-based recovery flow can +run. 
+ +### Per-target behavior + +| Target | Splits | Merges | Parent assessment | Extra acknowledgement | +|-------------------|--------|--------|-------------------|-----------------------| +| User table region | yes | yes | yes (from `hbase:meta`) | `--region-offline` (or `--dry-run`) | +| `hbase:meta` | no | no | skipped | `--force-meta` + `--region-offline` | +| `master:store` | no | no | skipped | `--region-offline` (master JVM stopped) | + +## User-facing shape + +`StoreFileListRecoverTool` runs in a fresh JVM and rebuilds the manifest from HDFS alone; only the +best-effort parent assessment reads `hbase:meta` over a client connection. It is in the same family +as `hbase wal` / `hbase hfile` / `hbase sft` (i.e., `Configured implements Tool`). + +``` +# User table — rebuild the manifest from disk (region must be offline) +hbase sftrecover --table ns:t --region 3d58e... --columnfamily f --region-offline + +# User table — dry-run (assess and report only; nothing written) +hbase sftrecover --table ns:t --region 3d58e... --columnfamily f --dry-run + +# hbase:meta — master must be stopped first +hbase sftrecover --table hbase:meta --region 1588230740 --columnfamily info \ + --region-offline --force-meta + +# master:store — master JVM must be stopped first +hbase sftrecover --table master:store --region ENCODED_REGION_NAME --columnfamily proc \ + --region-offline +``` + +CLI inputs: + +- `-t`/`--table`, `-r`/`--region`, `-cf`/`--columnfamily` +- `--dry-run` — print the recover result (including the data-loss assessment) without writing a new + manifest +- `--region-offline` — operator acknowledgement that the target region is offline (not hosted by any + master/RS). This is the real quiescence guarantee the tool relies on. +- `--force-meta` — allow recovery against `hbase:meta`. Dangerous; only valid with the master + offline. 
+ +CLI exit codes: + +- `0` recover completed (manifest written, dry-run completed, or no-op) +- `1` argument parsing error +- `2` precondition check failed or IO failure during recover + +## Preconditions + +- The operator supplies `--region-offline` (or `--dry-run`). The tool refuses to write a new + manifest otherwise, because it cannot itself prove the region is not hosted somewhere. +- The target table must use the FILE store-file tracker (or MIGRATION). The tool refuses other + trackers because a `.filelist` it writes would not be consulted at runtime. +- For `hbase:meta`, `--force-meta` is required, and the operator must have stopped the master. +- For `master:store`, the operator must have stopped **all** master JVMs. A master started against a + still-corrupt `.filelist` will fail to initialize its `ProcedureExecutor`, so recovery must + complete before any master is restarted. + +## Reconstruction: disk-only + +Enumerate the files that currently exist in the child family directory, filter them with the same +rules used by the `DefaultStoreFileTracker` (`tracker.getStoreFiles(...)`), and build a new manifest +from exactly that set. References and links that physically exist on disk are preserved (the +`Reference` body is carried into the manifest entry); nothing is synthesized. + +This is the only reconstruction mode. The manifest is always exactly what is on disk. + +## Data-loss assessment (reporting only) + +For user-table regions the tool resolves split/merge parents from `hbase:meta`: + +- merge parents are read from the child row's merge qualifiers + (`CatalogFamilyFormat.getMergeRegions`) +- otherwise the table's regions are scanned for a split parent that lists this region as a daughter + (`MetaTableAccessor.getDaughterRegions`) + +For each resolved parent the tool classifies its on-disk archive status. 
Reference files and +`HFileLink`s in the parent directory are excluded from the count, since they do not represent +unarchived parent data: + +- **`ARCHIVED`** — the parent region directory was not found. The Catalog Janitor only archives a + parent after its daughters have compacted away all references, so in normal operation a missing + parent directory means its data was already propagated into this region. This is an *inference*, + not a verification: a missing directory is also the on-disk symptom of a parent lost (to HDFS + corruption or operator error) *before* archival, so the verdict is reported as "likely" and the + operator is advised to confirm the parent's HFiles exist under the archive if in doubt. +- **`PRESENT_NO_FILES`** — the parent directory exists but carries no unarchived HFiles. +- **`PRESENT_WITH_FILES`** — the parent directory exists and still has unarchived HFiles. + +Verdict: + +- **All parents archived** → `LIKELY NO DATA LOSS`: the parent directories are missing, inferred to + mean their data was archived after being compacted into this region. The disk-only manifest is + authoritative under that inference. +- **Parents present but no unarchived HFiles** → `NO DATA LOSS`: the disk-only manifest is + authoritative. +- **Any parent `PRESENT_WITH_FILES`** → `POTENTIAL DATA LOSS`: the Catalog Janitor had not finished + propagating parent data to this region when the manifest was lost, so the disk-only manifest may + be missing rows. **Manual data recovery may be required** — the operator should review the parent + regions before bringing this region online. + +This assessment is never written into the manifest and never adds entries to it. It only informs the +operator. + +### Known limitation + +`meta` lineage can be stale (e.g. Catalog Janitor scheduled but not yet finished parent GC). In that +window a parent may show `PRESENT_WITH_FILES` even though it is about to be archived. 
This is +tolerable because the tool is offline and operator-driven: the recommended workflow is `--dry-run` +first, inspect the report, then apply. + +## Manifest write strategy + +Recover never rewrites the corrupted file in place. Instead it: + +1. diagnoses existing `.filelist` files (loads each; records the entry count or the load error) +2. computes the new store-file set from the on-disk listing +3. writes a brand new, strictly-newer tracker generation under `.filelist` via + `StoreFileListFile.writeNew(...)` + +Older (including corrupted) files are left in place in this phase. They are pruned by +`cleanUpTrackFiles(...)` on the next normal `load(false)` once a region opens, which is the moment +HBase already owns a consistent view of the new generation. + +Invariant: the new tracker file uses `seqId = max(now, highestSeqId+1)`. This guarantees: + +- the new file wins the `select(...)` race in `StoreFileListFile.load(boolean)`, +- the new file does not collide with any existing seqId, so the `> 2 files for sequence id` + `DoNotRetryIOException` cannot be triggered. + +### No-op detection + +If the latest healthy tracker file already exposes the same set of store-file names as the +recomputed manifest, the tool reports `No recover needed` and writes nothing. This avoids +unnecessary seqId churn when the operator runs the tool defensively against a healthy store. + +## Safety rules + +- Prefer `--dry-run` first. +- Refuse to write a manifest unless `--region-offline` (or `--dry-run`) is supplied. +- Refuse to recover stores that are not configured to use the FILE (or MIGRATION) tracker. +- Refuse `hbase:meta` without `--force-meta`. +- Never synthesize split/merge artifacts. The manifest is always exactly the on-disk file set. +- The split/merge parent assessment is read-only and best-effort: if `hbase:meta` cannot be reached + or a parent directory cannot be opened, skip that assessment and continue; never abort the + recover. 
+ +## Tests + +`TestStoreFileListRecover` (small test, in-process `HBaseCommonTestingUtil`): + +- corrupted manifest is diagnosed and replaced with a strictly-newer disk-only generation +- recover with no parents is purely disk-only +- archived split parent → `LIKELY NO DATA LOSS` (`allParentsArchived` true, `hasUnarchivedParents` + false), and the manifest contains only the child's own on-disk HFile +- unarchived split parent → `PRESENT_WITH_FILES` / `hasUnarchivedParents` true, and the manifest + still contains no parent-derived entries +- merge with mixed parent status (one archived, one present-with-files) +- dry-run writes nothing and leaves the corrupt file in place +- no-op detection when the current manifest already matches the on-disk set + +## Future direction + +Out of scope for this phase but worth recording so boundaries are explicit: + +- **Bulk recover** wrapper: "recover all corrupted regions in table T". Composes naturally on top of + the single-store tool. +- **Forbid FILE for `master:store`** going forward: extend the existing `MIGRATION` rejection in + `MasterRegionFactory.withTrackerConfigs(...)` to also reject `FILE` for fresh bootstraps. Existing + FILE-imprinted `master:store` regions must keep working, so the check should only fire on + fresh-bootstrap (TD doesn't yet exist on disk). This is preventive only — anyone already on FILE + for `master:store` still needs the offline tool as the recovery path. Tracked separately. 
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/storefiletracker/StoreFileListFile.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/storefiletracker/StoreFileListFile.java index 1137f1cf856a..fc88d73c6779 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/storefiletracker/StoreFileListFile.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/storefiletracker/StoreFileListFile.java @@ -303,4 +303,38 @@ synchronized void resetWriteState() { nextTrackFile = -1; prevTimestamp = -1; } + + /** + * Repair-only write path: write a brand new tracker generation under {@link #TRACK_FILE_DIR} + * without consulting (and without trusting) any existing generation. The new file is written with + * a strictly newer sequence id than any existing tracker file so a subsequent {@code load(false)} + * will pick it as the winner and prune the older (possibly corrupted) generations via + * {@link #cleanUpTrackFiles}. + *

+ * The caller is expected to have decided that an offline repair is required, e.g. because the + * normal {@link #load(boolean)} fails on the latest generation due to checksum, parse or version + * corruption. + *

+ * This method intentionally does NOT delete older tracker files. They are pruned by the next + * regular {@code load(false)} once a region opens, which is the point at which HBase already + * owns a consistent view of the new generation. + */ + Path writeNew(StoreFileList.Builder builder) throws IOException { + NavigableMap> seqId2TrackFiles = listFiles(); + long highestSeqId = seqId2TrackFiles.isEmpty() ? -1L : seqId2TrackFiles.firstKey(); + long seqId = Math.max(EnvironmentEdgeManager.currentTime(), highestSeqId + 1); + FileSystem fs = ctx.getRegionFileSystem().getFileSystem(); + if (!fs.exists(trackFileDir)) { + fs.mkdirs(trackFileDir); + } + Path file = new Path(trackFileDir, TRACK_FILE_PREFIX + TRACK_FILE_SEPARATOR + seqId); + long timestamp = Math.max(prevTimestamp + 1, EnvironmentEdgeManager.currentTime()); + write(fs, file, builder.setTimestamp(timestamp).setVersion(VERSION).build()); + // Reset internal state so that this StoreFileListFile instance is not silently reused for a + // subsequent update() without re-loading. A subsequent caller must run load(false) which will + // see the new generation as the winner and clean up older files. + prevTimestamp = -1; + nextTrackFile = -1; + return file; + } } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/storefiletracker/StoreFileListFilePrettyPrinter.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/storefiletracker/StoreFileListFilePrettyPrinter.java index 1025a4759cfb..79b71be202cd 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/storefiletracker/StoreFileListFilePrettyPrinter.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/storefiletracker/StoreFileListFilePrettyPrinter.java @@ -49,6 +49,16 @@ import org.apache.hadoop.hbase.shaded.protobuf.generated.StoreFileTrackerProtos.StoreFileList; +/** + * Read-only viewer for FILE store-file-tracker manifests ({@code .filelist}). 
Prints the store + * file names recorded in a tracker file, either for a directly-specified file or for every tracker + * file currently present under a {@code table/region/family}'s {@code .filelist} directory (each + * file's contents are printed, prefixed by its path; this includes any stale older generations that + * have not yet been pruned, not only the one the runtime would load). + *

+ * This tool does not modify anything. To rebuild a corrupted manifest use the offline + * {@link StoreFileListRecoverTool}. + */ @InterfaceAudience.LimitedPrivate(HBaseInterfaceAudience.TOOLS) @InterfaceStability.Evolving public class StoreFileListFilePrettyPrinter extends Configured implements Tool { @@ -126,9 +136,9 @@ public boolean parseOptions(String[] args) throws ParseException, IOException { formatter.printHelp(cmdString, options, true); System.exit(1); } - TableName tn = TableName.valueOf(tableNameWtihNS); - namespace = tn.getNamespaceAsString(); - tableName = tn.getNameAsString(); + TableName targetTableName = TableName.valueOf(tableNameWtihNS); + namespace = targetTableName.getNamespaceAsString(); + tableName = targetTableName.getNameAsString(); } return true; } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/storefiletracker/StoreFileListRecover.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/storefiletracker/StoreFileListRecover.java new file mode 100644 index 000000000000..23b1cf2b8931 --- /dev/null +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/storefiletracker/StoreFileListRecover.java @@ -0,0 +1,548 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hbase.regionserver.storefiletracker; + +import java.io.FileNotFoundException; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; +import java.util.HashSet; +import java.util.List; +import java.util.Set; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hbase.CatalogFamilyFormat; +import org.apache.hadoop.hbase.MetaTableAccessor; +import org.apache.hadoop.hbase.client.ColumnFamilyDescriptor; +import org.apache.hadoop.hbase.client.Connection; +import org.apache.hadoop.hbase.client.ConnectionFactory; +import org.apache.hadoop.hbase.client.RegionInfo; +import org.apache.hadoop.hbase.client.Result; +import org.apache.hadoop.hbase.client.TableDescriptor; +import org.apache.hadoop.hbase.io.HFileLink; +import org.apache.hadoop.hbase.regionserver.HRegionFileSystem; +import org.apache.hadoop.hbase.regionserver.StoreContext; +import org.apache.hadoop.hbase.regionserver.StoreFileInfo; +import org.apache.hadoop.hbase.regionserver.StoreUtils; +import org.apache.hadoop.hbase.util.PairOfSameType; +import org.apache.yetus.audience.InterfaceAudience; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.hbase.thirdparty.com.google.protobuf.ByteString; + +import org.apache.hadoop.hbase.shaded.protobuf.generated.FSProtos; +import org.apache.hadoop.hbase.shaded.protobuf.generated.StoreFileTrackerProtos.StoreFileEntry; +import org.apache.hadoop.hbase.shaded.protobuf.generated.StoreFileTrackerProtos.StoreFileList; + +/** + * Offline helper that rebuilds the FILE store-file-tracker manifest for a single store + * (table + region + family) when the latest manifest cannot be loaded. + *

+ * See {@code dev-support/design-docs/fsft-manifest-recover.md} for the full design. + *

+ * The recovered manifest is reconstructed purely from the store directory listing: the + * set of HFiles, references and links physically present under the family directory. Recovery + * never synthesizes references/links from split/merge lineage and never modifies an existing + * manifest in place. It writes a brand new, strictly-newer generation under {@code .filelist} via + * {@link StoreFileListFile#writeNew(StoreFileList.Builder)}, leaving {@code load(false)} to prune + * older files on the next region open. + *

+ * For user-table regions, split/merge parents discovered from {@code hbase:meta} are consulted for + * reporting only: if any parent still has unarchived HFiles on disk, the recovered store + * may be missing data the Catalog Janitor has not yet propagated, and the report flags potential + * data loss so an operator can decide whether a data recovery is required. + */ +@InterfaceAudience.Private +public final class StoreFileListRecover { + + private static final Logger LOG = LoggerFactory.getLogger(StoreFileListRecover.class); + + /** + * Tracks the on-disk archive status of a single split/merge parent region. Recovery uses this to + * distinguish parents that have been fully archived by the Catalog Janitor (no data loss) from + * parents that still have unarchived HFiles (potential data loss requiring operator review). + */ + public static final class ParentContribution { + public enum Status { + /** Parent region directory was not found; Catalog Janitor has archived it. */ + ARCHIVED, + /** Parent region directory exists and still has unarchived HFiles. */ + PRESENT_WITH_FILES, + /** Parent region directory exists but has no unarchived HFiles. 
*/ + PRESENT_NO_FILES + } + + private final RegionInfo parent; + private final Status status; + private final int unarchivedHFileCount; + + ParentContribution(RegionInfo parent, Status status, int unarchivedHFileCount) { + this.parent = parent; + this.status = status; + this.unarchivedHFileCount = unarchivedHFileCount; + } + + public RegionInfo getParent() { + return parent; + } + + public Status getStatus() { + return status; + } + + public int getUnarchivedHFileCount() { + return unarchivedHFileCount; + } + } + + public static final class TrackerFileDiagnostic { + private final Path path; + private final Integer storeFileCount; + private final String error; + + TrackerFileDiagnostic(Path path, Integer storeFileCount, String error) { + this.path = path; + this.storeFileCount = storeFileCount; + this.error = error; + } + + public Path getPath() { + return path; + } + + public Integer getStoreFileCount() { + return storeFileCount; + } + + public String getError() { + return error; + } + + public boolean isCorrupted() { + return error != null; + } + } + + public static final class RecoverReport { + private final List diagnostics; + private final List manifestEntries; + private final List parentContributions; + private final Path writtenManifest; + private final boolean noOp; + + RecoverReport(List diagnostics, List manifestEntries, + List parentContributions, Path writtenManifest, boolean noOp) { + this.diagnostics = Collections.unmodifiableList(new ArrayList<>(diagnostics)); + this.manifestEntries = Collections.unmodifiableList(new ArrayList<>(manifestEntries)); + this.parentContributions = + Collections.unmodifiableList(new ArrayList<>(parentContributions)); + this.writtenManifest = writtenManifest; + this.noOp = noOp; + } + + public List getDiagnostics() { + return diagnostics; + } + + /** The store-file set reconstructed from the store directory; this is what gets written. 
*/ + public List getManifestEntries() { + return manifestEntries; + } + + public List getParentContributions() { + return parentContributions; + } + + public Path getWrittenManifest() { + return writtenManifest; + } + + public boolean isNoOp() { + return noOp; + } + + public boolean hasCorruption() { + for (TrackerFileDiagnostic d : diagnostics) { + if (d.isCorrupted()) { + return true; + } + } + return false; + } + + /** Returns true when at least one parent was assessed and all of them were already archived. */ + public boolean allParentsArchived() { + if (parentContributions.isEmpty()) { + return false; + } + for (ParentContribution pc : parentContributions) { + if (pc.getStatus() != ParentContribution.Status.ARCHIVED) { + return false; + } + } + return true; + } + + /** Returns true when at least one parent still has unarchived HFiles on disk. */ + public boolean hasUnarchivedParents() { + for (ParentContribution pc : parentContributions) { + if (pc.getStatus() == ParentContribution.Status.PRESENT_WITH_FILES) { + return true; + } + } + return false; + } + } + + private StoreFileListRecover() { + } + + /** + * Rebuild the FSFT manifest for a single store from its on-disk file listing. + * @param conf configuration + * @param tableDescriptor descriptor of the store's table + * @param familyDescriptor descriptor of the target column family + * @param regionFs region filesystem opened read-only + * @param parents split/merge parent regions of this region (from {@code hbase:meta}), + * consulted for data-loss reporting only; pass an empty list to skip the + * assessment (e.g. 
for {@code hbase:meta} / {@code master:store}) + * @param dryRun when true, compute and report but do not write a new manifest + */ + public static RecoverReport recover(Configuration conf, TableDescriptor tableDescriptor, + ColumnFamilyDescriptor familyDescriptor, HRegionFileSystem regionFs, List parents, + boolean dryRun) throws IOException { + StoreContext storeContext = StoreContext.getBuilder() + .withColumnFamilyDescriptor(familyDescriptor) + .withFamilyStoreDirectoryPath(regionFs.getStoreDir(familyDescriptor.getNameAsString())) + .withRegionFileSystem(regionFs).build(); + StoreFileListFile storeFileListFile = new StoreFileListFile(storeContext); + + List diagnostics = + diagnoseTrackerFiles(storeFileListFile, regionFs, familyDescriptor); + + // The manifest is reconstructed purely from the store directory listing. + List manifestEntries = + loadStoreFilesFromDisk(conf, tableDescriptor, familyDescriptor, regionFs); + + // Assess split/merge parents for data-loss reporting only. No references/links are synthesized + // into the manifest from this. + List parentContributions = (parents == null || parents.isEmpty()) + ? Collections.emptyList() + : assessParents(conf, tableDescriptor, familyDescriptor, regionFs, parents); + + // No-op detection: if there is a healthy latest tracker file whose contents already match + // the recomputed set by name, do not churn the seqId. + boolean noOp = isAlreadyHealthy(diagnostics, manifestEntries, storeFileListFile); + + Path writtenManifest = null; + if (!dryRun && !noOp) { + writtenManifest = storeFileListFile.writeNew(toStoreFileListBuilder(manifestEntries)); + LOG.info("Wrote recovered FSFT manifest at {} with {} entries", writtenManifest, + manifestEntries.size()); + } + return new RecoverReport(diagnostics, manifestEntries, parentContributions, writtenManifest, + noOp); + } + + /** + * Resolve the split/merge parent regions for a region by consulting {@code hbase:meta}. 
Returns + * the merge parents recorded on the region's own row if present; otherwise scans the table's + * regions for a split parent that references this region as a daughter. Returns an empty list if + * the region has no recorded lineage. + * @param conn connection to use for meta lookups; must not be closed by this method + * @param regionInfo the child region whose parents we want + */ + public static List resolveParents(Connection conn, RegionInfo regionInfo) + throws IOException { + Result childRow = MetaTableAccessor.getRegionResult(conn, regionInfo); + if (childRow != null && !childRow.isEmpty()) { + List mergeParents = CatalogFamilyFormat.getMergeRegions(childRow.rawCells()); + if (mergeParents != null && !mergeParents.isEmpty()) { + return new ArrayList<>(mergeParents); + } + } + final RegionInfo[] splitParent = new RegionInfo[1]; + MetaTableAccessor.scanMetaForTableRegions(conn, result -> { + PairOfSameType daughters = MetaTableAccessor.getDaughterRegions(result); + if (regionInfo.equals(daughters.getFirst()) || regionInfo.equals(daughters.getSecond())) { + splitParent[0] = CatalogFamilyFormat.getRegionInfo(result); + return false; + } + return true; + }, regionInfo.getTable()); + return splitParent[0] != null ? Collections.singletonList(splitParent[0]) + : Collections.emptyList(); + } + + /** + * Convenience overload that opens (and closes) its own {@link Connection} from {@code conf}. Use + * from standalone/offline contexts (the {@code sftrecover} CLI). + */ + public static List resolveParents(Configuration conf, RegionInfo regionInfo) + throws IOException { + try (Connection conn = ConnectionFactory.createConnection(conf)) { + return resolveParents(conn, regionInfo); + } + } + + /** + * Returns true when the tracker generation the runtime would actually serve already exposes the + * same store-file name set as the recomputed one, so recovery would only churn the seqId. 
This is + * best-effort and only avoids unnecessary writes; it never relaxes a safety check. When in doubt + * it returns false, because writing a fresh, strictly-newer generation is always safe. + *

+ * It faithfully mirrors {@link StoreFileListFile#load(boolean)} selection: generations are + * ordered by the numeric seqId parsed from the file name (not lexicographically), and + * within the winning seqId the {@code f1}/{@code f2} rotation pair is disambiguated by the + * internal {@link StoreFileList#getTimestamp()} exactly like {@code select(...)}. Crucially, if + * any corrupted tracker file sits at or above the newest healthy generation, {@code load(false)} + * would hit it first and fail region open, so this is not treated as a no-op. + */ + private static boolean isAlreadyHealthy(List diagnostics, + List manifestEntries, StoreFileListFile storeFileListFile) { + if (diagnostics.isEmpty()) { + // No tracker files at all -> not "already healthy"; we still need to write one if + // there is at least one entry to record. If there are no entries either, treat as no-op. + return manifestEntries.isEmpty(); + } + // Highest-seqId healthy generation, by numeric seqId (mirroring StoreFileListFile.listFiles()). + long newestHealthySeqId = -1L; + for (TrackerFileDiagnostic d : diagnostics) { + if (d.isCorrupted()) { + continue; + } + newestHealthySeqId = Math.max(newestHealthySeqId, parseSeqId(d.getPath())); + } + if (newestHealthySeqId < 0) { + // Every tracker file is corrupted; recovery is definitely needed. + return false; + } + // If a corrupted tracker file has a seqId >= the newest healthy generation, the runtime + // load(false) visits it first and a non-EOF corruption fails region open before the healthy + // generation is ever reached. Recovery is required; do not declare a no-op. + for (TrackerFileDiagnostic d : diagnostics) { + if (d.isCorrupted() && parseSeqId(d.getPath()) >= newestHealthySeqId) { + return false; + } + } + // Among the healthy files sharing the newest seqId there may be an f1/f2 rotation pair carrying + // different internal timestamps; the one with the greater timestamp is what the runtime serves. 
+ StoreFileList winner = null; + for (TrackerFileDiagnostic d : diagnostics) { + if (d.isCorrupted() || parseSeqId(d.getPath()) != newestHealthySeqId) { + continue; + } + try { + StoreFileList candidate = storeFileListFile.load(d.getPath()); + if (winner == null || candidate.getTimestamp() > winner.getTimestamp()) { + winner = candidate; + } + } catch (IOException e) { + // A file previously diagnosed as healthy now fails to load; be conservative and recover. + return false; + } + } + if (winner == null) { + return false; + } + if (winner.getStoreFileCount() != manifestEntries.size()) { + return false; + } + Set expected = new HashSet<>(); + for (StoreFileInfo info : manifestEntries) { + expected.add(info.getPath().getName()); + } + for (StoreFileEntry entry : winner.getStoreFileList()) { + if (!expected.contains(entry.getName())) { + return false; + } + } + return true; + } + + /** + * Parse the numeric seqId encoded in a tracker file name ({@code f1}, {@code f1.}, + * {@code f2.}), mirroring {@link StoreFileListFile#listFiles()}: a missing or unparseable + * suffix yields {@code 0}. The {@link StoreFileListFile#TRACK_FILE_PATTERN} guarantees the suffix + * (when present) is all digits, so this never throws for valid track files. 
+ */ + private static long parseSeqId(Path path) { + String name = path.getName(); + int sep = name.indexOf(StoreFileListFile.TRACK_FILE_SEPARATOR); + if (sep < 0 || sep == name.length() - 1) { + return 0L; + } + try { + return Long.parseLong(name.substring(sep + 1)); + } catch (NumberFormatException e) { + return 0L; + } + } + + private static List diagnoseTrackerFiles( + StoreFileListFile storeFileListFile, HRegionFileSystem regionFs, + ColumnFamilyDescriptor familyDescriptor) throws IOException { + FileSystem fs = regionFs.getFileSystem(); + Path trackFileDir = new Path(regionFs.getStoreDir(familyDescriptor.getNameAsString()), + StoreFileListFile.TRACK_FILE_DIR); + FileStatus[] statuses; + try { + statuses = fs.listStatus(trackFileDir); + } catch (FileNotFoundException e) { + return Collections.emptyList(); + } + if (statuses == null || statuses.length == 0) { + return Collections.emptyList(); + } + List diagnostics = new ArrayList<>(); + for (FileStatus status : statuses) { + Path path = status.getPath(); + if ( + !status.isFile() || !StoreFileListFile.TRACK_FILE_PATTERN.matcher(path.getName()).matches() + ) { + continue; + } + try { + StoreFileList storeFileList = storeFileListFile.load(path); + diagnostics.add(new TrackerFileDiagnostic(path, storeFileList.getStoreFileCount(), null)); + } catch (IOException e) { + diagnostics.add(new TrackerFileDiagnostic(path, null, e.getMessage())); + } + } + return diagnostics; + } + + private static List loadStoreFilesFromDisk(Configuration conf, + TableDescriptor tableDescriptor, ColumnFamilyDescriptor familyDescriptor, + HRegionFileSystem regionFs) throws IOException { + Configuration storeConf = + StoreUtils.createStoreConfiguration(conf, tableDescriptor, familyDescriptor); + StoreContext ctx = StoreContext.getBuilder().withColumnFamilyDescriptor(familyDescriptor) + .withFamilyStoreDirectoryPath(regionFs.getStoreDir(familyDescriptor.getNameAsString())) + .withRegionFileSystem(regionFs).build(); + DefaultStoreFileTracker 
tracker = new DefaultStoreFileTracker(storeConf, + regionFs.getRegionInfo().getReplicaId() == RegionInfo.DEFAULT_REPLICA_ID, ctx); + List files = tracker.getStoreFiles(familyDescriptor.getNameAsString()); + return files != null ? files : Collections.emptyList(); + } + + /** + * Holds the result of probing a parent region directory: the real (non-reference, non-link) + * HFiles still present, and whether the parent directory was archived (not found). + */ + private static final class ParentLoadResult { + final List hfiles; + final boolean archived; + + ParentLoadResult(List hfiles, boolean archived) { + this.hfiles = hfiles; + this.archived = archived; + } + } + + /** + * Assess each split/merge parent's on-disk archive status for data-loss reporting. This is purely + * diagnostic: it never contributes entries to the recovered manifest. + */ + private static List assessParents(Configuration conf, + TableDescriptor tableDescriptor, ColumnFamilyDescriptor familyDescriptor, + HRegionFileSystem regionFs, List parents) throws IOException { + List contributions = new ArrayList<>(parents.size()); + for (RegionInfo parent : parents) { + ParentLoadResult load = + loadParentHFilesOnly(conf, tableDescriptor, familyDescriptor, regionFs, parent); + if (load.archived) { + contributions.add(new ParentContribution(parent, ParentContribution.Status.ARCHIVED, 0)); + } else if (load.hfiles.isEmpty()) { + contributions.add( + new ParentContribution(parent, ParentContribution.Status.PRESENT_NO_FILES, 0)); + } else { + contributions.add(new ParentContribution(parent, + ParentContribution.Status.PRESENT_WITH_FILES, load.hfiles.size())); + } + } + return contributions; + } + + /** + * Returns the parent region's real on-disk HFiles only (reference files, link files, MOB link + * files etc. are excluded, as they do not represent unarchived parent data). The returned + * {@link ParentLoadResult#archived} flag indicates whether the parent region directory was not + * found (i.e. 
the Catalog Janitor archived it). + */ + private static ParentLoadResult loadParentHFilesOnly(Configuration conf, + TableDescriptor tableDescriptor, ColumnFamilyDescriptor familyDescriptor, + HRegionFileSystem childRegionFs, RegionInfo parentRegion) throws IOException { + // Explicitly check whether the parent region directory exists. openRegionFromFileSystem + // with readOnly=true may silently succeed even for a missing directory, deferring the + // failure to a later listStatus call that surfaces as an empty result rather than FNF. + FileSystem fs = childRegionFs.getFileSystem(); + Path parentRegionDir = new Path(childRegionFs.getTableDir(), parentRegion.getEncodedName()); + if (!fs.exists(parentRegionDir)) { + LOG.info("Parent region directory not found for {}; treating as archived/missing.", + parentRegion.getEncodedName()); + return new ParentLoadResult(Collections.emptyList(), true); + } + HRegionFileSystem parentRegionFs; + try { + parentRegionFs = HRegionFileSystem.openRegionFromFileSystem(conf, fs, + childRegionFs.getTableDir(), parentRegion, true); + } catch (FileNotFoundException e) { + LOG.info("Parent region directory not found for {}; treating as archived/missing.", + parentRegion.getEncodedName()); + return new ParentLoadResult(Collections.emptyList(), true); + } catch (IOException e) { + LOG.warn("Failed to open parent region {}; skipping data-loss assessment for it.", + parentRegion.getEncodedName(), e); + return new ParentLoadResult(Collections.emptyList(), false); + } + List all = + loadStoreFilesFromDisk(conf, tableDescriptor, familyDescriptor, parentRegionFs); + List hfilesOnly = new ArrayList<>(all.size()); + for (StoreFileInfo info : all) { + if (info.isReference() || HFileLink.isHFileLink(info.getPath().getName())) { + LOG.debug("Skipping non-HFile entry {} in parent {} during data-loss assessment.", + info.getPath().getName(), parentRegion.getEncodedName()); + continue; + } + hfilesOnly.add(info); + } + return new 
ParentLoadResult(hfilesOnly, false); + } + + private static StoreFileList.Builder + toStoreFileListBuilder(Collection storeFiles) { + StoreFileList.Builder builder = StoreFileList.newBuilder(); + for (StoreFileInfo info : storeFiles) { + StoreFileEntry.Builder entry = + StoreFileEntry.newBuilder().setName(info.getPath().getName()).setSize(info.getSize()); + if (info.isReference()) { + FSProtos.Reference reference = FSProtos.Reference.newBuilder() + .setSplitkey(ByteString.copyFrom(info.getReference().getSplitKey())) + .setRange(info.getReference().convert().getRange()).build(); + entry.setReference(reference); + } + builder.addStoreFile(entry.build()); + } + return builder; + } +} diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/storefiletracker/StoreFileListRecoverTool.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/storefiletracker/StoreFileListRecoverTool.java new file mode 100644 index 000000000000..2699aac6b888 --- /dev/null +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/storefiletracker/StoreFileListRecoverTool.java @@ -0,0 +1,316 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.hadoop.hbase.regionserver.storefiletracker; + +import java.io.IOException; +import java.io.PrintStream; +import java.util.Collections; +import java.util.List; +import org.apache.commons.lang3.StringUtils; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.conf.Configured; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hbase.HBaseConfiguration; +import org.apache.hadoop.hbase.HBaseInterfaceAudience; +import org.apache.hadoop.hbase.TableName; +import org.apache.hadoop.hbase.client.ColumnFamilyDescriptor; +import org.apache.hadoop.hbase.client.RegionInfo; +import org.apache.hadoop.hbase.client.TableDescriptor; +import org.apache.hadoop.hbase.master.region.MasterRegionFactory; +import org.apache.hadoop.hbase.regionserver.HRegionFileSystem; +import org.apache.hadoop.hbase.regionserver.StoreUtils; +import org.apache.hadoop.hbase.util.Bytes; +import org.apache.hadoop.hbase.util.CommonFSUtils; +import org.apache.hadoop.hbase.util.FSTableDescriptors; +import org.apache.hadoop.util.Tool; +import org.apache.hadoop.util.ToolRunner; +import org.apache.yetus.audience.InterfaceAudience; +import org.apache.yetus.audience.InterfaceStability; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.hbase.thirdparty.org.apache.commons.cli.CommandLine; +import org.apache.hbase.thirdparty.org.apache.commons.cli.CommandLineParser; +import org.apache.hbase.thirdparty.org.apache.commons.cli.HelpFormatter; +import org.apache.hbase.thirdparty.org.apache.commons.cli.Option; +import org.apache.hbase.thirdparty.org.apache.commons.cli.Options; +import org.apache.hbase.thirdparty.org.apache.commons.cli.ParseException; +import org.apache.hbase.thirdparty.org.apache.commons.cli.PosixParser; + +/** + * Offline, operator-driven CLI to rebuild a corrupted FILE store-file-tracker manifest + * ({@code .filelist}) for a single store ({@code table + region + family}). + *

+ * This is the sole repair surface of the FSFT manifest-recover design (see + * {@code dev-support/design-docs/fsft-manifest-recover.md}). It is offline by design: there is no + * online/in-master path, because nothing in the master can truly fence a RegionServer away from the + * store directory while a manifest is being rewritten. The operator instead acknowledges, via + * {@code --region-offline}, that the target region is not hosted anywhere -- a real quiescence + * guarantee -- before any manifest is written. + *

+ * The recovered manifest is reconstructed purely from the store directory listing. For user-table + * regions, the tool additionally consults {@code hbase:meta} for split/merge parents and reports + * whether bringing the region online risks data loss (parents with unarchived HFiles) or not (all + * parents already archived by the Catalog Janitor). All the logic lives in + * {@link StoreFileListRecover}; this class is only the CLI surface: argument parsing, safety + * acknowledgements, and report formatting. + */ +@InterfaceAudience.LimitedPrivate(HBaseInterfaceAudience.TOOLS) +@InterfaceStability.Evolving +public class StoreFileListRecoverTool extends Configured implements Tool { + private static final Logger LOG = LoggerFactory.getLogger(StoreFileListRecoverTool.class); + + private final String tableNameOption = "t"; + private final String columnFamilyOption = "cf"; + private final String regionOption = "r"; + private final String dryRunOption = "dry-run"; + private final String forceMetaOption = "force-meta"; + private final String regionOfflineOption = "region-offline"; + + private final String cmdString = "sftrecover"; + + private final Options options = new Options(); + + private String regionName; + private String columnFamily; + private TableName targetTableName; + private boolean dryRun; + private boolean forceMeta; + private boolean regionOfflineAck; + + private PrintStream out = System.out; + private PrintStream err = System.err; + + public StoreFileListRecoverTool() { + super(); + init(); + } + + public StoreFileListRecoverTool(Configuration conf) { + super(conf); + init(); + } + + private void init() { + options.addOption(new Option(tableNameOption, "table", true, + "Table of the target store; e.g. test_table or ns:test_table")); + options.addOption(new Option(columnFamilyOption, "columnfamily", true, + "Column family of the target store; e.g. 
f")); + options.addOption(new Option(regionOption, "region", true, + "Encoded region name of the target store; e.g. '3d58e9067bf23e378e68c071f3dd39eb'")); + options.addOption(new Option(null, dryRunOption, false, + "Print the recover result without writing a new manifest")); + options.addOption(new Option(null, forceMetaOption, false, + "Allow recover against the hbase:meta table. Dangerous; only use with master offline.")); + options.addOption(new Option(null, regionOfflineOption, false, + "Operator acknowledgement that the target region is offline (no master/RS hosting it).")); + } + + private boolean parseOptions(String[] args) throws ParseException { + HelpFormatter formatter = new HelpFormatter(); + if (args.length == 0) { + formatter.printHelp(cmdString, options, true); + return false; + } + CommandLineParser parser = new PosixParser(); + CommandLine cmd = parser.parse(options, args); + + dryRun = cmd.hasOption(dryRunOption); + forceMeta = cmd.hasOption(forceMetaOption); + regionOfflineAck = cmd.hasOption(regionOfflineOption); + + regionName = cmd.getOptionValue(regionOption); + if (StringUtils.isEmpty(regionName)) { + err.println("Region name is not specified."); + formatter.printHelp(cmdString, options, true); + return false; + } + columnFamily = cmd.getOptionValue(columnFamilyOption); + if (StringUtils.isEmpty(columnFamily)) { + err.println("Column family is not specified."); + formatter.printHelp(cmdString, options, true); + return false; + } + String tableNameWithNS = cmd.getOptionValue(tableNameOption); + if (StringUtils.isEmpty(tableNameWithNS)) { + err.println("Table name is not specified."); + formatter.printHelp(cmdString, options, true); + return false; + } + targetTableName = TableName.valueOf(tableNameWithNS); + return true; + } + + @Override + public int run(String[] args) { + if (getConf() == null) { + throw new RuntimeException("A Configuration instance must be provided."); + } + try { + CommonFSUtils.setFsDefault(getConf(), 
CommonFSUtils.getRootDir(getConf())); + if (!parseOptions(args)) { + return 1; + } + } catch (IOException | ParseException ex) { + LOG.error("Error parsing command-line options", ex); + return 1; + } + try { + return recoverStoreFileList(); + } catch (IOException e) { + LOG.error("Error recovering store file list", e); + return 2; + } + } + + private int recoverStoreFileList() throws IOException { + if (!regionOfflineAck && !dryRun) { + err.println("ERROR, recover requires either --" + dryRunOption + " or --" + + regionOfflineOption + " to acknowledge the region is offline. Refusing to write a new" + + " manifest while the region may be online."); + return 2; + } + if (TableName.isMetaTableName(targetTableName) && !forceMeta) { + err.println("ERROR, refusing to recover hbase:meta without --" + forceMetaOption + + ". This is dangerous and only valid with the master offline."); + return 2; + } + Path root = CommonFSUtils.getRootDir(getConf()); + Path tablePath = CommonFSUtils.getTableDir(root, targetTableName); + Path regionPath = new Path(tablePath, regionName); + FileSystem fs = root.getFileSystem(getConf()); + TableDescriptor tableDescriptor = FSTableDescriptors.getTableDescriptorFromFs(fs, tablePath); + if (tableDescriptor == null) { + err.println("ERROR, unable to load table descriptor for " + targetTableName); + return 2; + } + ColumnFamilyDescriptor familyDescriptor = + tableDescriptor.getColumnFamily(Bytes.toBytes(columnFamily)); + if (familyDescriptor == null) { + err.println("ERROR, column family does not exist: " + columnFamily); + return 2; + } + String trackerName = StoreFileTrackerFactory.getStoreFileTrackerName( + StoreUtils.createStoreConfiguration(getConf(), tableDescriptor, familyDescriptor)); + if ( + !StoreFileTrackerFactory.Trackers.FILE.name().equalsIgnoreCase(trackerName) + && !StoreFileTrackerFactory.Trackers.MIGRATION.name().equalsIgnoreCase(trackerName) + ) { + err.println("ERROR, table " + targetTableName + " is not configured to use FILE 
store file" + + " tracker (current: " + trackerName + "). Refusing to write a manifest the runtime" + + " will not consult."); + return 2; + } + RegionInfo regionInfo = HRegionFileSystem.loadRegionInfoFileContent(fs, regionPath); + HRegionFileSystem regionFs = + HRegionFileSystem.openRegionFromFileSystem(getConf(), fs, tablePath, regionInfo, true); + + // Split/merge parent assessment is meaningful only for user-table regions. hbase:meta and + // master:store have no catalog lineage to consult, so skip the meta-walk for them. + List parents = Collections.emptyList(); + if ( + !TableName.isMetaTableName(targetTableName) + && !MasterRegionFactory.TABLE_NAME.equals(targetTableName) + ) { + try { + parents = StoreFileListRecover.resolveParents(getConf(), regionInfo); + } catch (IOException e) { + LOG.warn("Failed to resolve split/merge parents for {} from hbase:meta; the data-loss" + + " assessment will be skipped.", regionInfo.getEncodedName(), e); + parents = Collections.emptyList(); + } + } + + StoreFileListRecover.RecoverReport report = StoreFileListRecover.recover(getConf(), + tableDescriptor, familyDescriptor, regionFs, parents, dryRun); + printRecoverReport(report); + return 0; + } + + private void printRecoverReport(StoreFileListRecover.RecoverReport report) { + out.println("Dry run: " + dryRun); + for (StoreFileListRecover.TrackerFileDiagnostic diagnostic : report.getDiagnostics()) { + if (diagnostic.getError() == null) { + out.println("Tracker file " + diagnostic.getPath() + " loaded with " + + diagnostic.getStoreFileCount() + " entries"); + } else { + out.println( + "Tracker file " + diagnostic.getPath() + " is corrupted: " + diagnostic.getError()); + } + } + out.println("Manifest entries (rebuilt from disk): " + report.getManifestEntries().size()); + + // Per-parent on-disk status and data-loss verdict. 
+ if (!report.getParentContributions().isEmpty()) { + out.println("--- Split/merge parent assessment ---"); + for (StoreFileListRecover.ParentContribution pc : report.getParentContributions()) { + String parentName = pc.getParent().getEncodedName(); + switch (pc.getStatus()) { + case ARCHIVED: + out.println(" Parent " + parentName + ": ARCHIVED (directory not found)."); + break; + case PRESENT_WITH_FILES: + out.println(" Parent " + parentName + ": PRESENT, " + pc.getUnarchivedHFileCount() + + " unarchived HFile(s)."); + break; + case PRESENT_NO_FILES: + out.println(" Parent " + parentName + ": PRESENT, no unarchived HFiles."); + break; + default: + break; + } + } + if (report.hasUnarchivedParents()) { + out.println("POTENTIAL DATA LOSS: one or more split/merge parents still have unarchived" + + " HFiles. The Catalog Janitor had not finished propagating parent data to this region" + + " when the manifest was lost. The disk-only manifest may be missing rows. Manual data" + + " recovery may be required -- review the parent regions before bringing this region" + + " online."); + } else if (report.allParentsArchived()) { + out.println("LIKELY NO DATA LOSS: all split/merge parent directories are missing, which is" + + " inferred to mean the Catalog Janitor archived them after their data was compacted" + + " into this region. NOTE: a missing directory is not by itself proof the data was" + + " archived (the same symptom occurs if a parent dir was lost before archival). If in" + + " doubt, verify the parents' HFiles exist under the archive before relying on the" + + " disk-only manifest."); + } else { + out.println("NO DATA LOSS: split/merge parents are present but carry no unarchived HFiles." + + " The disk-only manifest is authoritative."); + } + } + + if (dryRun) { + out.println("Dry-run completed. 
No new manifest was written."); + } else if (report.isNoOp()) { + out.println( + "No recover needed: existing tracker file already matches the recomputed manifest."); + } else if (report.getWrittenManifest() != null) { + out.println("Wrote recovered manifest to " + report.getWrittenManifest()); + } else { + out.println("WARNING: recover did not write a manifest and was not a dry-run; this is" + + " unexpected and may indicate a bug."); + } + } + + public static void main(String[] args) throws Exception { + Configuration conf = HBaseConfiguration.create(); + int ret = ToolRunner.run(conf, new StoreFileListRecoverTool(), args); + System.exit(ret); + } +} diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/storefiletracker/TestStoreFileListRecover.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/storefiletracker/TestStoreFileListRecover.java new file mode 100644 index 000000000000..772629b31a8c --- /dev/null +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/storefiletracker/TestStoreFileListRecover.java @@ -0,0 +1,459 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.hadoop.hbase.regionserver.storefiletracker; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertNull; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.io.IOException; +import java.util.Arrays; +import java.util.Collections; +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hbase.HBaseCommonTestingUtil; +import org.apache.hadoop.hbase.TableName; +import org.apache.hadoop.hbase.client.ColumnFamilyDescriptor; +import org.apache.hadoop.hbase.client.ColumnFamilyDescriptorBuilder; +import org.apache.hadoop.hbase.client.RegionInfo; +import org.apache.hadoop.hbase.client.RegionInfoBuilder; +import org.apache.hadoop.hbase.client.TableDescriptor; +import org.apache.hadoop.hbase.client.TableDescriptorBuilder; +import org.apache.hadoop.hbase.io.Reference; +import org.apache.hadoop.hbase.regionserver.HRegionFileSystem; +import org.apache.hadoop.hbase.testclassification.RegionServerTests; +import org.apache.hadoop.hbase.testclassification.SmallTests; +import org.apache.hadoop.hbase.util.Bytes; +import org.apache.hadoop.hbase.util.CommonFSUtils; +import org.apache.hadoop.hbase.util.HFileTestUtil; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Tag; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.TestInfo; + +import org.apache.hadoop.hbase.shaded.protobuf.generated.FSProtos; +import org.apache.hadoop.hbase.shaded.protobuf.generated.StoreFileTrackerProtos.StoreFileEntry; +import org.apache.hadoop.hbase.shaded.protobuf.generated.StoreFileTrackerProtos.StoreFileList; +/** Tests {@link StoreFileListRecover}: manifest rebuild from disk and parent assessment. */ +@Tag(RegionServerTests.TAG) +@Tag(SmallTests.TAG) +public class TestStoreFileListRecover { + +
private static final HBaseCommonTestingUtil UTIL = new HBaseCommonTestingUtil(); + private static final byte[] FAMILY = Bytes.toBytes("f"); + private static final byte[] QUALIFIER = Bytes.toBytes("q"); + private static final String FAMILY_NAME = Bytes.toString(FAMILY); + private static final TableName TABLE_NAME = TableName.valueOf("ns:tbl"); + + private FileSystem fs; + private Path rootDir; + private Path tableDir; + private TableDescriptor tableDescriptor; + private ColumnFamilyDescriptor familyDescriptor; + /** Prepares a per-test-method data directory, the table directory and the descriptors. */ + @BeforeEach + public void setUp(TestInfo testInfo) throws IOException { + fs = FileSystem.get(UTIL.getConfiguration()); + rootDir = UTIL.getDataTestDir(testInfo.getTestMethod().get().getName()); + tableDir = CommonFSUtils.getTableDir(rootDir, TABLE_NAME); + fs.mkdirs(tableDir); + familyDescriptor = ColumnFamilyDescriptorBuilder.of(FAMILY); + tableDescriptor = + TableDescriptorBuilder.newBuilder(TABLE_NAME).setColumnFamily(familyDescriptor).build(); + } + /** Invokes the shared utility's test-directory cleanup once after all tests. */ + @AfterAll + public static void tearDown() { + UTIL.cleanupTestDir(); + } + /** A corrupt tracker file is diagnosed by name and a manifest with a higher seqId is written. */ + @Test + public void testCorruptedManifestIsDiagnosedAndReplaced() throws Exception { + RegionInfo regionInfo = RegionInfoBuilder.newBuilder(TABLE_NAME).setRegionId(1L).build(); + HRegionFileSystem regionFs = createRegion(regionInfo); + Path familyDir = regionFs.getStoreDir(FAMILY_NAME); + Path hfile = new Path(familyDir, "abcdef01"); + HFileTestUtil.createHFile(UTIL.getConfiguration(), fs, hfile, FAMILY, QUALIFIER, + Bytes.toBytes("a"), Bytes.toBytes("z"), 10); + Path corrupt = writeCorruptTracker(regionFs, "f1.1"); + + StoreFileListRecover.RecoverReport report = StoreFileListRecover.recover( + UTIL.getConfiguration(), tableDescriptor, familyDescriptor, regionFs, + Collections.emptyList(), false); + + // Diagnostics must mention the corrupted file.
+ assertTrue(report.hasCorruption(), "expected diagnostics to surface the corrupted file"); + assertTrue( + report.getDiagnostics().stream() + .anyMatch(d -> d.isCorrupted() && d.getPath().getName().equals(corrupt.getName())), + "corrupted file should be reported by name"); + + assertEquals(1, report.getManifestEntries().size()); + assertNotNull(report.getWrittenManifest()); + + StoreFileList recovered = StoreFileListFile.load(fs, report.getWrittenManifest()); + assertEquals(1, recovered.getStoreFileCount()); + assertEquals("abcdef01", recovered.getStoreFile(0).getName()); + + // The recovered manifest must have a strictly newer seqId than the corrupted file. + long corruptSeqId = parseSeqId(corrupt); + long recoveredSeqId = parseSeqId(report.getWrittenManifest()); + assertTrue(recoveredSeqId > corruptSeqId, + "recovered seqId " + recoveredSeqId + " should be > corrupted " + corruptSeqId); + } + /** With no parents passed, the manifest lists the single on-disk HFile and no parent entries. */ + @Test + public void testNoParentsIsDiskOnly() throws Exception { + RegionInfo regionInfo = RegionInfoBuilder.newBuilder(TABLE_NAME).setRegionId(2L).build(); + HRegionFileSystem regionFs = createRegion(regionInfo); + Path hfile = new Path(regionFs.getStoreDir(FAMILY_NAME), "abcdef02"); + HFileTestUtil.createHFile(UTIL.getConfiguration(), fs, hfile, FAMILY, QUALIFIER, + Bytes.toBytes("a"), Bytes.toBytes("z"), 10); + + StoreFileListRecover.RecoverReport report = StoreFileListRecover.recover( + UTIL.getConfiguration(), tableDescriptor, familyDescriptor, regionFs, + Collections.emptyList(), false); + + assertEquals(1, report.getManifestEntries().size()); + assertTrue(report.getParentContributions().isEmpty(), + "no parents passed -> no parent assessment"); + StoreFileList recovered = StoreFileListFile.load(fs, report.getWrittenManifest()); + assertEquals(1, recovered.getStoreFileCount()); + assertEquals("abcdef02", recovered.getStoreFile(0).getName()); + } + /** A parent whose region dir was deleted is reported ARCHIVED; manifest has only child files. */ + @Test + public void testArchivedParentReportsNoDataLoss() throws Exception { + RegionInfo parent =
RegionInfoBuilder.newBuilder(TABLE_NAME).setRegionId(51L) + .setStartKey(Bytes.toBytes("")).setEndKey(Bytes.toBytes("")).build(); + HRegionFileSystem parentFs = createRegion(parent); + HFileTestUtil.createHFile(UTIL.getConfiguration(), fs, + new Path(parentFs.getStoreDir(FAMILY_NAME), "abcdef50"), FAMILY, QUALIFIER, + Bytes.toBytes("a"), Bytes.toBytes("z"), 10); + + RegionInfo topChild = RegionInfoBuilder.newBuilder(TABLE_NAME).setRegionId(52L) + .setStartKey(Bytes.toBytes("m")).setEndKey(Bytes.toBytes("")).build(); + // Child has its own already-compacted-in HFile. + HRegionFileSystem childFs = createRegion(topChild); + HFileTestUtil.createHFile(UTIL.getConfiguration(), fs, + new Path(childFs.getStoreDir(FAMILY_NAME), "abcdef59"), FAMILY, QUALIFIER, Bytes.toBytes("m"), + Bytes.toBytes("z"), 10); + + // Simulate Catalog Janitor having archived (deleted) the parent's region directory. + Path parentRegionDir = new Path(tableDir, parent.getEncodedName()); + assertTrue(fs.exists(parentRegionDir), "test setup: parent dir should exist"); + assertTrue(fs.delete(parentRegionDir, true), "delete parent dir to simulate archive"); + + StoreFileListRecover.RecoverReport report = StoreFileListRecover.recover( + UTIL.getConfiguration(), tableDescriptor, familyDescriptor, childFs, + Collections.singletonList(parent), false); + + // Manifest is rebuilt from the child's own disk files; the parent never contributes entries. + assertEquals(1, report.getManifestEntries().size(), + "manifest is disk-only and must contain only the child's own HFile"); + assertEquals("abcdef59", report.getManifestEntries().get(0).getPath().getName()); + + // Parent contribution is reported as ARCHIVED -> no data loss.
+ assertEquals(1, report.getParentContributions().size()); + StoreFileListRecover.ParentContribution pc = report.getParentContributions().get(0); + assertEquals(parent.getEncodedName(), pc.getParent().getEncodedName()); + assertEquals(StoreFileListRecover.ParentContribution.Status.ARCHIVED, pc.getStatus()); + assertEquals(0, pc.getUnarchivedHFileCount()); + assertTrue(report.allParentsArchived(), + "allParentsArchived should be true when parent is archived"); + assertFalse(report.hasUnarchivedParents(), + "hasUnarchivedParents should be false when parent is archived"); + } + /** A parent still holding an HFile is PRESENT_WITH_FILES; nothing is added to the manifest. */ + @Test + public void testUnarchivedParentReportsPotentialDataLoss() throws Exception { + // Split parent is still present on disk with HFiles -> potential data loss. + RegionInfo parent = RegionInfoBuilder.newBuilder(TABLE_NAME).setRegionId(53L) + .setStartKey(Bytes.toBytes("")).setEndKey(Bytes.toBytes("")).build(); + HRegionFileSystem parentFs = createRegion(parent); + HFileTestUtil.createHFile(UTIL.getConfiguration(), fs, + new Path(parentFs.getStoreDir(FAMILY_NAME), "abcdef55"), FAMILY, QUALIFIER, + Bytes.toBytes("a"), Bytes.toBytes("z"), 10); + + RegionInfo topChild = RegionInfoBuilder.newBuilder(TABLE_NAME).setRegionId(54L) + .setStartKey(Bytes.toBytes("m")).setEndKey(Bytes.toBytes("")).build(); + HRegionFileSystem childFs = createRegion(topChild); + + StoreFileListRecover.RecoverReport report = StoreFileListRecover.recover( + UTIL.getConfiguration(), tableDescriptor, familyDescriptor, childFs, + Collections.singletonList(parent), false); + + // The manifest is still disk-only: the unarchived parent does NOT inject entries. + assertEquals(0, report.getManifestEntries().size(), + "manifest must remain disk-only; parent files are never injected"); + + // Parent contribution should be PRESENT_WITH_FILES -> potential data loss.
+ assertEquals(1, report.getParentContributions().size()); + StoreFileListRecover.ParentContribution pc = report.getParentContributions().get(0); + assertEquals(parent.getEncodedName(), pc.getParent().getEncodedName()); + assertEquals(StoreFileListRecover.ParentContribution.Status.PRESENT_WITH_FILES, pc.getStatus()); + assertTrue(pc.getUnarchivedHFileCount() > 0, "unarchived HFile count should be > 0"); + assertFalse(report.allParentsArchived(), + "allParentsArchived should be false when parent has files"); + assertTrue(report.hasUnarchivedParents(), + "hasUnarchivedParents should be true when parent has files"); + } + /** Of two merge parents, the deleted one is ARCHIVED and the other PRESENT_WITH_FILES. */ + @Test + public void testMergeWithMixedArchiveStatus() throws Exception { + // Two merge parents: one archived, one still present with files. + RegionInfo mergeParentA = RegionInfoBuilder.newBuilder(TABLE_NAME).setRegionId(55L) + .setStartKey(Bytes.toBytes("")).setEndKey(Bytes.toBytes("m")).build(); + RegionInfo mergeParentB = RegionInfoBuilder.newBuilder(TABLE_NAME).setRegionId(56L) + .setStartKey(Bytes.toBytes("m")).setEndKey(Bytes.toBytes("")).build(); + HRegionFileSystem parentAFs = createRegion(mergeParentA); + HRegionFileSystem parentBFs = createRegion(mergeParentB); + HFileTestUtil.createHFile(UTIL.getConfiguration(), fs, + new Path(parentAFs.getStoreDir(FAMILY_NAME), "abcdef56"), FAMILY, QUALIFIER, + Bytes.toBytes("a"), Bytes.toBytes("l"), 10); + HFileTestUtil.createHFile(UTIL.getConfiguration(), fs, + new Path(parentBFs.getStoreDir(FAMILY_NAME), "abcdef57"), FAMILY, QUALIFIER, + Bytes.toBytes("m"), Bytes.toBytes("z"), 10); + + // Delete parent A to simulate archival.
+ Path parentADir = new Path(tableDir, mergeParentA.getEncodedName()); + assertTrue(fs.delete(parentADir, true), "delete parent A to simulate archive"); + + RegionInfo mergedChild = RegionInfoBuilder.newBuilder(TABLE_NAME).setRegionId(57L) + .setStartKey(Bytes.toBytes("")).setEndKey(Bytes.toBytes("")).build(); + HRegionFileSystem childFs = createRegion(mergedChild); + + StoreFileListRecover.RecoverReport report = StoreFileListRecover.recover( + UTIL.getConfiguration(), tableDescriptor, familyDescriptor, childFs, + Arrays.asList(mergeParentA, mergeParentB), false); + + // Two parent contributions: one ARCHIVED, one PRESENT_WITH_FILES. + assertEquals(2, report.getParentContributions().size()); + StoreFileListRecover.ParentContribution pcA = report.getParentContributions().stream() + .filter(pc -> pc.getParent().getEncodedName().equals(mergeParentA.getEncodedName())) + .findFirst().orElse(null); + StoreFileListRecover.ParentContribution pcB = report.getParentContributions().stream() + .filter(pc -> pc.getParent().getEncodedName().equals(mergeParentB.getEncodedName())) + .findFirst().orElse(null); + assertNotNull(pcA, "parent A contribution must be present"); + assertNotNull(pcB, "parent B contribution must be present"); + assertEquals(StoreFileListRecover.ParentContribution.Status.ARCHIVED, pcA.getStatus()); + assertEquals(StoreFileListRecover.ParentContribution.Status.PRESENT_WITH_FILES, + pcB.getStatus()); + assertFalse(report.allParentsArchived(), "allParentsArchived should be false (mixed status)"); + assertTrue(report.hasUnarchivedParents(), + "hasUnarchivedParents should be true (parent B has files)"); + } + /** A dry run writes no manifest; only the planted corrupt file remains in the track dir. */ + @Test + public void testDryRunDoesNotWriteManifest() throws Exception { + RegionInfo regionInfo = RegionInfoBuilder.newBuilder(TABLE_NAME).setRegionId(8L).build(); + HRegionFileSystem regionFs = createRegion(regionInfo); + Path hfile = new Path(regionFs.getStoreDir(FAMILY_NAME), "abcdef30"); + HFileTestUtil.createHFile(UTIL.getConfiguration(), fs, hfile,
FAMILY, QUALIFIER, + Bytes.toBytes("a"), Bytes.toBytes("z"), 10); + Path corrupt = writeCorruptTracker(regionFs, "f1.1"); + + StoreFileListRecover.RecoverReport report = StoreFileListRecover.recover( + UTIL.getConfiguration(), tableDescriptor, familyDescriptor, regionFs, + Collections.emptyList(), true); + + assertNull(report.getWrittenManifest(), "dry-run must not write a new manifest"); + assertTrue(fs.exists(corrupt), "corrupted tracker file must remain after dry-run"); + Path trackDir = new Path(regionFs.getStoreDir(FAMILY_NAME), StoreFileListFile.TRACK_FILE_DIR); + // Only the corrupt file should be in the track dir, no new f1/f2 should have been created. + int count = 0; + for (org.apache.hadoop.fs.FileStatus s : fs.listStatus(trackDir)) { + assertEquals(corrupt.getName(), s.getPath().getName()); + count++; + } + assertEquals(1, count); + } + /** A second recover over an unchanged store is a no-op and writes no new manifest. */ + @Test + public void testNoOpWhenManifestAlreadyMatchesDisk() throws Exception { + // First, write a healthy manifest by running recover against a non-corrupted store. + RegionInfo regionInfo = RegionInfoBuilder.newBuilder(TABLE_NAME).setRegionId(9L).build(); + HRegionFileSystem regionFs = createRegion(regionInfo); + Path hfile = new Path(regionFs.getStoreDir(FAMILY_NAME), "abcdef60"); + HFileTestUtil.createHFile(UTIL.getConfiguration(), fs, hfile, FAMILY, QUALIFIER, + Bytes.toBytes("a"), Bytes.toBytes("z"), 10); + + StoreFileListRecover.RecoverReport first = StoreFileListRecover.recover( + UTIL.getConfiguration(), tableDescriptor, familyDescriptor, regionFs, + Collections.emptyList(), false); + assertNotNull(first.getWrittenManifest()); + assertFalse(first.isNoOp()); + + // Run again. There is no corruption and the manifest matches disk; should be a no-op.
+ StoreFileListRecover.RecoverReport second = StoreFileListRecover.recover( + UTIL.getConfiguration(), tableDescriptor, familyDescriptor, regionFs, + Collections.emptyList(), false); + assertTrue(second.isNoOp(), "second recover should be a no-op"); + assertNull(second.getWrittenManifest(), "no new manifest should have been written"); + } + /** A corrupt file with a higher seqId than the healthy manifest forces a newer rewrite. */ + @Test + public void testCorruptHighestSeqIdIsNotMaskedByHealthyOlderFile() throws Exception { + // Write a healthy manifest first. + RegionInfo regionInfo = RegionInfoBuilder.newBuilder(TABLE_NAME).setRegionId(10L).build(); + HRegionFileSystem regionFs = createRegion(regionInfo); + Path hfile = new Path(regionFs.getStoreDir(FAMILY_NAME), "abcdef70"); + HFileTestUtil.createHFile(UTIL.getConfiguration(), fs, hfile, FAMILY, QUALIFIER, + Bytes.toBytes("a"), Bytes.toBytes("z"), 10); + StoreFileListRecover.RecoverReport first = StoreFileListRecover.recover( + UTIL.getConfiguration(), tableDescriptor, familyDescriptor, regionFs, + Collections.emptyList(), false); + long healthySeqId = parseSeqId(first.getWrittenManifest()); + + // Now plant a corrupted tracker file with a *higher* seqId than the healthy generation. The + // runtime load(false) visits the highest seqId first, so this corruption would fail region open + // even though the older healthy file matches disk. Recovery must NOT treat this as a no-op. + long corruptSeqId = healthySeqId + 1_000_000L; + Path corrupt = writeCorruptTracker(regionFs, "f2."
+ corruptSeqId); + + StoreFileListRecover.RecoverReport second = StoreFileListRecover.recover( + UTIL.getConfiguration(), tableDescriptor, familyDescriptor, regionFs, + Collections.emptyList(), false); + + assertTrue(second.hasCorruption(), "the higher-seqId corruption must be surfaced"); + assertFalse(second.isNoOp(), + "recovery must not no-op when a corrupt file outranks the healthy generation"); + assertNotNull(second.getWrittenManifest(), "a fresh generation must be written"); + long recoveredSeqId = parseSeqId(second.getWrittenManifest()); + assertTrue(recoveredSeqId > corruptSeqId, "recovered seqId " + recoveredSeqId + + " must outrank the corrupt file " + corruptSeqId); + assertTrue(fs.exists(corrupt), "corrupt file is left in place; pruned on next load(false)"); + } + /** A corrupt file with a lower seqId than the healthy manifest leaves recover a no-op. */ + @Test + public void testCorruptOlderFileDoesNotBlockNoOp() throws Exception { + // A healthy manifest plus a corrupted file with a *lower* seqId: the runtime would never reach + // the corrupt file, so the store is effectively healthy and recovery should no-op. + RegionInfo regionInfo = RegionInfoBuilder.newBuilder(TABLE_NAME).setRegionId(11L).build(); + HRegionFileSystem regionFs = createRegion(regionInfo); + Path hfile = new Path(regionFs.getStoreDir(FAMILY_NAME), "abcdef71"); + HFileTestUtil.createHFile(UTIL.getConfiguration(), fs, hfile, FAMILY, QUALIFIER, + Bytes.toBytes("a"), Bytes.toBytes("z"), 10); + StoreFileListRecover.RecoverReport first = StoreFileListRecover.recover( + UTIL.getConfiguration(), tableDescriptor, familyDescriptor, regionFs, + Collections.emptyList(), false); + assertNotNull(first.getWrittenManifest()); + + // Plant a corrupt file whose numeric seqId is below the healthy generation's.
+ writeCorruptTracker(regionFs, "f1.1"); + + StoreFileListRecover.RecoverReport second = StoreFileListRecover.recover( + UTIL.getConfiguration(), tableDescriptor, familyDescriptor, regionFs, + Collections.emptyList(), false); + + assertTrue(second.isNoOp(), + "a lower-seqId corrupt file the runtime never reaches must not force a rewrite"); + assertNull(second.getWrittenManifest()); + } + /** An on-disk TOP reference is kept in the manifest with its range and split key intact. */ + @Test + public void testReferenceFilePreservedInRecoveredManifest() throws Exception { + RegionInfo regionInfo = RegionInfoBuilder.newBuilder(TABLE_NAME).setRegionId(12L).build(); + HRegionFileSystem regionFs = createRegion(regionInfo); + Path familyDir = regionFs.getStoreDir(FAMILY_NAME); + // A plain HFile plus a TOP split-reference file physically present on disk. + HFileTestUtil.createHFile(UTIL.getConfiguration(), fs, new Path(familyDir, "abcdef80"), FAMILY, + QUALIFIER, Bytes.toBytes("a"), Bytes.toBytes("z"), 10); + byte[] splitRow = Bytes.toBytes("split-row-key"); + String refName = "abcdef81.0123456789abcdef0123456789abcde0"; + Reference original = Reference.createTopReference(splitRow); + original.write(fs, new Path(familyDir, refName)); + + StoreFileListRecover.RecoverReport report = StoreFileListRecover.recover( + UTIL.getConfiguration(), tableDescriptor, familyDescriptor, regionFs, + Collections.emptyList(), false); + + assertEquals(2, report.getManifestEntries().size(), + "both the HFile and reference are recorded"); + StoreFileList recovered = StoreFileListFile.load(fs, report.getWrittenManifest()); + StoreFileEntry refEntry = recovered.getStoreFileList().stream() + .filter(e -> e.getName().equals(refName)).findFirst().orElse(null); + assertNotNull(refEntry, "the reference entry must be in the recovered manifest"); + assertTrue(refEntry.hasReference(), "reference entry must carry a Reference body"); + assertEquals(FSProtos.Reference.Range.TOP, refEntry.getReference().getRange()); + // The Reference body (range + encoded split key) must round-trip faithfully.
+ Reference roundTripped = Reference.convert(refEntry.getReference()); + assertEquals(0, Bytes.compareTo(original.getSplitKey(), roundTripped.getSplitKey()), + "the encoded split key must round-trip through the recovered manifest"); + } + /** A parent holding only a reference file is PRESENT_NO_FILES with zero unarchived HFiles. */ + @Test + public void testPresentParentWithOnlyReferenceReportsNoDataLoss() throws Exception { + // Parent directory exists but its only store file is a reference (not unarchived parent data). + RegionInfo parent = RegionInfoBuilder.newBuilder(TABLE_NAME).setRegionId(58L) + .setStartKey(Bytes.toBytes("")).setEndKey(Bytes.toBytes("")).build(); + HRegionFileSystem parentFs = createRegion(parent); + byte[] splitRow = Bytes.toBytes("p"); + Reference.createBottomReference(splitRow).write(fs, + new Path(parentFs.getStoreDir(FAMILY_NAME), "abcdef90.0123456789abcdef0123456789abcde1")); + + RegionInfo child = RegionInfoBuilder.newBuilder(TABLE_NAME).setRegionId(59L) + .setStartKey(Bytes.toBytes("m")).setEndKey(Bytes.toBytes("")).build(); + HRegionFileSystem childFs = createRegion(child); + + StoreFileListRecover.RecoverReport report = StoreFileListRecover.recover( + UTIL.getConfiguration(), tableDescriptor, familyDescriptor, childFs, + Collections.singletonList(parent), false); + + assertEquals(1, report.getParentContributions().size()); + StoreFileListRecover.ParentContribution pc = report.getParentContributions().get(0); + assertEquals(StoreFileListRecover.ParentContribution.Status.PRESENT_NO_FILES, pc.getStatus()); + assertEquals(0, pc.getUnarchivedHFileCount(), + "a reference file does not count as unarchived parent data"); + assertFalse(report.allParentsArchived()); + assertFalse(report.hasUnarchivedParents(), + "present-but-no-files parent must not raise a data-loss flag"); + } + /** Calls {@code HRegionFileSystem.create} for the region and makes its family store dir. */ + private HRegionFileSystem createRegion(RegionInfo regionInfo) throws IOException { + HRegionFileSystem regionFs = + HRegionFileSystem.create(UTIL.getConfiguration(), fs, tableDir, regionInfo); + fs.mkdirs(regionFs.getStoreDir(FAMILY_NAME)); + return
regionFs; + } + /** Writes a tracker file with a mismatching checksum into the track dir; returns its path. */ + private Path writeCorruptTracker(HRegionFileSystem regionFs, String fileName) throws IOException { + Path trackDir = new Path(regionFs.getStoreDir(FAMILY_NAME), StoreFileListFile.TRACK_FILE_DIR); + fs.mkdirs(trackDir); + Path file = new Path(trackDir, fileName); + try (FSDataOutputStream out = fs.create(file, true)) { + // Write an inconsistent length+payload+checksum so load() throws an IOException + // (the checksum will not match), exercising the corruption diagnostic path. + out.writeInt(8); + out.writeLong(1L); + out.writeInt(0xdeadbeef); + } + return file; + } + /** Returns the number after the first dot of the file name, or 0 if there is no dot. */ + private static long parseSeqId(Path file) { + String n = file.getName(); + int dot = n.indexOf('.'); + return dot < 0 ? 0L : Long.parseLong(n.substring(dot + 1)); + } +}