// ManifestVerifier.java

/*
 * Copyright (C) 2023 DANS - Data Archiving and Networked Services (info@dans.knaw.nl)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package nl.knaw.dans.bagit.verify;

import java.io.IOException;
import java.nio.file.DirectoryStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.HashSet;
import java.util.ResourceBundle;
import java.util.Set;
import java.util.concurrent.ConcurrentSkipListSet;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

import nl.knaw.dans.bagit.domain.Bag;
import nl.knaw.dans.bagit.exceptions.FileNotInPayloadDirectoryException;
import nl.knaw.dans.bagit.exceptions.InvalidBagitFileFormatException;
import nl.knaw.dans.bagit.exceptions.MaliciousPathException;
import nl.knaw.dans.bagit.exceptions.UnsupportedAlgorithmException;
import nl.knaw.dans.bagit.reader.ManifestReader;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.slf4j.helpers.MessageFormatter;

import nl.knaw.dans.bagit.domain.Manifest;
import nl.knaw.dans.bagit.domain.Version;
import nl.knaw.dans.bagit.hash.BagitAlgorithmNameToSupportedAlgorithmMapping;
import nl.knaw.dans.bagit.hash.StandardBagitAlgorithmNameToSupportedAlgorithmMapping;
import nl.knaw.dans.bagit.util.PathUtils;

/**
 * Responsible for all things related to the manifest during verification.
 * <p>
 * The verifier holds an {@link ExecutorService} that is used to check, in parallel, that the files
 * listed in the manifests exist. {@link #close()} shuts that executor down, so instances are best
 * used in a try-with-resources statement.
 */
public class ManifestVerifier implements AutoCloseable {
  private static final Logger logger = LoggerFactory.getLogger(ManifestVerifier.class);
  private static final ResourceBundle messages = ResourceBundle.getBundle("MessageBundle");

  private final transient BagitAlgorithmNameToSupportedAlgorithmMapping nameMapping;
  private final transient ExecutorService executor;

  /**
   * Create a ManifestVerifier using a fixed-size thread pool (one thread per available processor)
   * and the {@link StandardBagitAlgorithmNameToSupportedAlgorithmMapping} mapping
   */
  public ManifestVerifier() {
    this(new StandardBagitAlgorithmNameToSupportedAlgorithmMapping(), newDefaultExecutor());
  }

  /**
   * Create a ManifestVerifier using a fixed-size thread pool (one thread per available processor)
   * and a custom mapping
   *
   * @param nameMapping the mapping between BagIt algorithm name and the java supported algorithm
   */
  public ManifestVerifier(final BagitAlgorithmNameToSupportedAlgorithmMapping nameMapping) {
    this(nameMapping, newDefaultExecutor());
  }

  /**
   * Create a ManifestVerifier using a custom thread pool and the
   * {@link StandardBagitAlgorithmNameToSupportedAlgorithmMapping} mapping
   *
   * @param executor the thread pool to use when doing work; it is shut down by {@link #close()}
   */
  public ManifestVerifier(final ExecutorService executor) {
    this(new StandardBagitAlgorithmNameToSupportedAlgorithmMapping(), executor);
  }

  /**
   * Create a ManifestVerifier using a custom thread pool and a custom mapping
   *
   * @param nameMapping the mapping between BagIt algorithm name and the java supported algorithm
   * @param executor the thread pool to use when doing work; it is shut down by {@link #close()}
   */
  public ManifestVerifier(final BagitAlgorithmNameToSupportedAlgorithmMapping nameMapping, final ExecutorService executor) {
    this.nameMapping = nameMapping;
    this.executor = executor;
  }

  /*
   * The thread pool used when the caller does not supply one:
   * a fixed pool with as many threads as there are available processors.
   */
  private static ExecutorService newDefaultExecutor() {
    return Executors.newFixedThreadPool(Runtime.getRuntime().availableProcessors());
  }

  /**
   * Shut down the thread pool used by this verifier, including a pool that was supplied by the
   * caller through one of the constructors. This only initiates the shutdown; it does not wait
   * for already submitted tasks to finish.
   *
   * @throws SecurityException if a security manager denies shutting down the thread pool
   */
  @Override
  public void close() throws SecurityException {
    //shutdown the thread pool so the resource isn't leaked
    executor.shutdown();
  }

  /**
   * Verify that all the files in the payload directory are listed in the payload manifest and
   * all files listed in all manifests exist.
   * <p>
   * For bags older than version 1.0 every payload file has to be listed in at least one manifest;
   * for version 1.0 and newer every payload file has to be listed in all payload manifests.
   *
   * @param bag the bag to check
   * @param ignoreHiddenFiles to ignore hidden files unless they are specifically listed in a manifest
   *
   * @throws IOException if there is a problem reading a file
   * @throws MaliciousPathException the path in the manifest was specifically crafted to cause harm
   * @throws UnsupportedAlgorithmException if the algorithm used for the manifest is unsupported
   * @throws InvalidBagitFileFormatException if any of the manifests don't conform to the bagit specification
   * @throws FileNotInPayloadDirectoryException if a file is listed in a manifest but doesn't exist in the payload directory
   * @throws InterruptedException if a thread is interrupted while doing work
   */
  public void verifyManifests(final Bag bag, final boolean ignoreHiddenFiles)
      throws IOException, MaliciousPathException, UnsupportedAlgorithmException,
      InvalidBagitFileFormatException, FileNotInPayloadDirectoryException, InterruptedException {

    final Set<Path> allFilesListedInManifests = getAllFilesListedInManifests(bag);
    checkAllFilesListedInManifestExist(allFilesListedInManifests);

    // the required manifest coverage of the payload directory depends on the bag's version
    if (bag.getVersion().isOlder(new Version(1, 0))) {
      checkAllFilesInPayloadDirAreListedInAtLeastOneAManifest(allFilesListedInManifests, PathUtils.getDataDir(bag), ignoreHiddenFiles);
    } else {
      checkAllFilesInPayloadDirAreListedInAllManifests(bag.getPayLoadManifests(), PathUtils.getDataDir(bag), ignoreHiddenFiles);
    }
  }

  /*
   * get all the files listed in all the manifests
   *
   * Every file in the bagit directory whose name starts with "tagmanifest-" or "manifest-" is
   * read as a manifest, and the paths it lists are merged into a single set.
   */
  private Set<Path> getAllFilesListedInManifests(final Bag bag)
      throws IOException, MaliciousPathException, UnsupportedAlgorithmException, InvalidBagitFileFormatException {
    logger.debug(messages.getString("all_files_in_manifests"));
    final Set<Path> filesListedInManifests = new HashSet<>();

    try (DirectoryStream<Path> directoryStream =
        Files.newDirectoryStream(PathUtils.getBagitDir(bag.getVersion(), bag.getRootDir()))) {
      for (final Path path : directoryStream) {
        final String filename = PathUtils.getFilename(path);
        if (filename.startsWith("tagmanifest-") || filename.startsWith("manifest-")) {
          logger.debug(messages.getString("get_listing_in_manifest"), path);
          final Manifest manifest = ManifestReader.readManifest(nameMapping, path, bag.getRootDir(),
              bag.getFileEncoding());
          filesListedInManifests.addAll(manifest.getFileToChecksumMap().keySet());
        }
      }
    }

    return filesListedInManifests;
  }

  /*
   * Make sure all the listed files actually exist
   *
   * One CheckIfFileExistsTask per file is handed to the executor together with the shared set of
   * missing files and a latch sized to the number of files. This method blocks until the latch
   * reaches zero and then fails if any file was reported missing.
   */
  @SuppressWarnings("PMD.AvoidInstantiatingObjectsInLoops")
  private void checkAllFilesListedInManifestExist(final Set<Path> files) throws FileNotInPayloadDirectoryException, InterruptedException {
    final CountDownLatch latch = new CountDownLatch(files.size());
    // written to from the executor's threads, hence a concurrent set
    final Set<Path> missingFiles = new ConcurrentSkipListSet<>();

    logger.info(messages.getString("check_all_files_in_manifests_exist"));
    for (final Path file : files) {
      executor.execute(new CheckIfFileExistsTask(file, missingFiles, latch));
    }

    latch.await();

    if (!missingFiles.isEmpty()) {
      final String formattedMessage = messages.getString("missing_payload_files_error");
      throw new FileNotInPayloadDirectoryException(MessageFormatter.format(formattedMessage, missingFiles).getMessage());
    }
  }

  /*
   * Make sure all files in the directory are in at least 1 manifest
   *
   * Nothing is checked when the payload directory does not exist.
   */
  private static void checkAllFilesInPayloadDirAreListedInAtLeastOneAManifest(final Set<Path> filesListedInManifests,
      final Path payloadDir, final boolean ignoreHiddenFiles) throws IOException {
    logger.debug(messages.getString("checking_file_in_at_least_one_manifest"), payloadDir);
    if (Files.exists(payloadDir)) {
      Files.walkFileTree(payloadDir,
          new PayloadFileExistsInAtLeastOneManifestVistor(filesListedInManifests, ignoreHiddenFiles));
    }
  }

  /*
   * as per the bagit-spec 1.0+ all files have to be listed in all manifests
   *
   * Nothing is checked when the payload directory does not exist.
   */
  private static void checkAllFilesInPayloadDirAreListedInAllManifests(final Set<Manifest> payLoadManifests,
      final Path payloadDir, final boolean ignoreHiddenFiles) throws IOException {
    logger.debug(messages.getString("checking_file_in_all_manifests"), payloadDir);
    if (Files.exists(payloadDir)) {
      Files.walkFileTree(payloadDir, new PayloadFileExistsInAllManifestsVistor(payLoadManifests, ignoreHiddenFiles));
    }
  }
}