BagVerifier.java

/*
 * Copyright (C) 2023 DANS - Data Archiving and Networked Services (info@dans.knaw.nl)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package nl.knaw.dans.bagit.verify;

import java.io.IOException;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Map.Entry;
import java.util.ResourceBundle;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

import nl.knaw.dans.bagit.domain.Bag;
import nl.knaw.dans.bagit.domain.Manifest;
import nl.knaw.dans.bagit.exceptions.CorruptChecksumException;
import nl.knaw.dans.bagit.exceptions.FileNotInManifestException;
import nl.knaw.dans.bagit.exceptions.FileNotInPayloadDirectoryException;
import nl.knaw.dans.bagit.exceptions.InvalidBagitFileFormatException;
import nl.knaw.dans.bagit.exceptions.InvalidPayloadOxumException;
import nl.knaw.dans.bagit.exceptions.MaliciousPathException;
import nl.knaw.dans.bagit.exceptions.MissingBagitFileException;
import nl.knaw.dans.bagit.exceptions.MissingPayloadDirectoryException;
import nl.knaw.dans.bagit.exceptions.MissingPayloadManifestException;
import nl.knaw.dans.bagit.exceptions.PayloadOxumDoesNotExistException;
import nl.knaw.dans.bagit.exceptions.UnsupportedAlgorithmException;
import nl.knaw.dans.bagit.exceptions.VerificationException;
import nl.knaw.dans.bagit.hash.BagitAlgorithmNameToSupportedAlgorithmMapping;
import nl.knaw.dans.bagit.hash.StandardBagitAlgorithmNameToSupportedAlgorithmMapping;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Responsible for verifying if a bag is valid, complete.
 * <p>
 * A verifier holds an {@link ExecutorService} that is used to check manifest hashes in parallel.
 * {@link #close()} shuts that executor down, <b>including an executor that was supplied by the caller</b>,
 * so use the verifier in a try-with-resources block and do not reuse the executor afterwards.
 */
public final class BagVerifier implements AutoCloseable{
  private static final Logger logger = LoggerFactory.getLogger(BagVerifier.class);
  private static final ResourceBundle messages = ResourceBundle.getBundle("MessageBundle");
  
  private final ManifestVerifier manifestVerifier;
  private final ExecutorService executor;
  
  /**
   * Create a BagVerifier with a fixed thread pool (one thread per available processor) and a 
   * {@link StandardBagitAlgorithmNameToSupportedAlgorithmMapping}
   */
  public BagVerifier(){
    this(Executors.newFixedThreadPool(Runtime.getRuntime().availableProcessors()), new StandardBagitAlgorithmNameToSupportedAlgorithmMapping());
  }
  
  /**
   * Create a BagVerifier with a fixed thread pool (one thread per available processor) and a custom mapping
   * 
   * @param nameMapping the mapping between BagIt algorithm name and the java supported algorithm
   */
  public BagVerifier(final BagitAlgorithmNameToSupportedAlgorithmMapping nameMapping){
    this(Executors.newFixedThreadPool(Runtime.getRuntime().availableProcessors()), nameMapping);
  }
  
  /**
   * Create a BagVerifier with a custom thread pool and a 
   * {@link StandardBagitAlgorithmNameToSupportedAlgorithmMapping}
   * 
   * @param executor the thread pool to use when doing work. It is shut down by {@link #close()}
   */
  public BagVerifier(final ExecutorService executor){
    this(executor, new StandardBagitAlgorithmNameToSupportedAlgorithmMapping());
  }
  
  /**
   * Create a BagVerifier with a custom thread pool and a custom mapping
   * 
   * @param executor the thread pool to use when doing work. It is shut down by {@link #close()}
   * @param nameMapping the mapping between BagIt algorithm name and the java supported algorithm
   */
  public BagVerifier(final ExecutorService executor, final BagitAlgorithmNameToSupportedAlgorithmMapping nameMapping){
    // the manifest verifier is handed the same executor that this class uses for hash checking
    manifestVerifier = new ManifestVerifier(nameMapping, executor);
    this.executor = executor;
  }
  
  /**
   * Shut down the thread pool and close the {@link ManifestVerifier} so that neither resource is leaked.
   * Already submitted tasks are not waited for.
   * 
   * @throws SecurityException if a security manager denies shutting down the thread pool
   */
  @Override
  public void close() throws SecurityException{
    try{
      //shutdown the thread pool so the resource isn't leaked
      executor.shutdown();
    }
    finally{
      //always release the manifest verifier, even when shutting down the pool was refused
      manifestVerifier.close();
    }
  }
  
  /**
   * Determine if we can quickly verify by comparing the number of files and the total number of bytes expected
   * 
   * @param bag the {@link Bag} object you wish to check
   * @return true if the bag can be quickly verified
   */
  public static boolean canQuickVerify(final Bag bag){
    return QuickVerifier.canQuickVerify(bag);
  }
  
  /**
   * Quickly verify by comparing the number of files and the total number of bytes expected
   * 
   * @param bag the bag to verify by payload-oxum
   * 
   * @throws IOException if there is an error reading a file
   * @throws InvalidPayloadOxumException if either the total bytes or the number of files
   * calculated for the payload directory of the bag is different than the supplied values
   * @throws PayloadOxumDoesNotExistException if the bag does not contain a payload-oxum.
   * To check, run {@link BagVerifier#canQuickVerify}
   */
  public static void quicklyVerify(final Bag bag) throws IOException, InvalidPayloadOxumException{
    QuickVerifier.quicklyVerify(bag);
  }

  /**
   * See <a href="https://tools.ietf.org/html/draft-kunze-bagit-13#section-3">https://tools.ietf.org/html/draft-kunze-bagit-13#section-3</a><br>
   *  A bag is <b>valid</b> if the bag is complete and every checksum has been 
   *  verified against the contents of its corresponding file.
   *  <p>
   *  Completeness is checked first via {@link #isComplete(Bag, boolean)}; after that the checksums of
   *  every payload manifest and then every tag manifest are verified.
   * 
   * @param bag the {@link Bag} object to check
   * @param ignoreHiddenFiles ignore hidden files unless explicitly listed in manifest(s)
   * 
   * @throws CorruptChecksumException when the computed hash doesn't match given hash
   * @throws IOException if there was an error with the file
   * @throws FileNotInManifestException if a file is found in the payload directory but not in manifest(s)
   * @throws MissingPayloadManifestException if there is not at least one payload manifest
   * @throws MissingBagitFileException  if there is no bagit.txt file
   * @throws MissingPayloadDirectoryException if there is no /data directory
   * @throws FileNotInPayloadDirectoryException if a manifest lists a file but it is not in the payload directory
   * @throws InterruptedException if the threads are interrupted when checking if all files are listed in manifest(s)
   * @throws MaliciousPathException if there is path that is referenced in the manifest that is outside the bag root directory
   * @throws VerificationException some other exception happened during processing so capture it here.
   * @throws UnsupportedAlgorithmException if the manifest uses a algorithm that isn't supported
   * @throws InvalidBagitFileFormatException if the manifest is not formatted properly
   */
  public void isValid(final Bag bag, final boolean ignoreHiddenFiles) throws IOException, FileNotInManifestException, MissingPayloadManifestException, MissingBagitFileException, MissingPayloadDirectoryException, FileNotInPayloadDirectoryException, InterruptedException, MaliciousPathException, CorruptChecksumException, VerificationException, UnsupportedAlgorithmException, InvalidBagitFileFormatException{
    logger.info(messages.getString("checking_bag_is_valid"), bag.getRootDir());
    isComplete(bag, ignoreHiddenFiles);
    
    logger.debug(messages.getString("checking_payload_checksums"));
    for(final Manifest payloadManifest : bag.getPayLoadManifests()){
      checkHashes(payloadManifest);
    }
    
    logger.debug(messages.getString("checking_tag_file_checksums"));
    for(final Manifest tagManifest : bag.getTagManifests()){
      checkHashes(tagManifest);
    }
  }
  
  /*
   * Check the supplied checksum hashes against the generated checksum hashes.
   * 
   * One CheckManifestHashesTask is submitted to the executor per manifest entry and this method blocks
   * until the latch reaches zero. The tasks are presumably responsible for counting the latch down and
   * for adding any failure to the shared exception collection (see CheckManifestHashesTask to confirm).
   * 
   * Only the first collected exception is reported: it is rethrown as is when it is a
   * CorruptChecksumException, otherwise it is wrapped in a VerificationException.
   */
  @SuppressWarnings("PMD.AvoidInstantiatingObjectsInLoops")
  void checkHashes(final Manifest manifest) throws CorruptChecksumException, InterruptedException, VerificationException{
    //snapshot the entries once so that the latch count always equals the number of submitted tasks;
    //reading the map twice could leave the latch waiting forever if the two reads disagree
    final Collection<Entry<Path, String>> entries = new ArrayList<>(manifest.getFileToChecksumMap().entrySet());
    final CountDownLatch latch = new CountDownLatch(entries.size());
    
    //TODO maybe return all of these at some point...
    final Collection<Exception> exceptions = Collections.synchronizedCollection(new ArrayList<>());
    
    for(final Entry<Path, String> entry : entries){
      executor.execute(new CheckManifestHashesTask(entry, manifest.getAlgorithm().getMessageDigestName(), latch, exceptions));
    }
    
    latch.await();
    
    if(!exceptions.isEmpty()){
      final Exception e = exceptions.iterator().next();
      if(e instanceof CorruptChecksumException){
        logger.debug(messages.getString("checksums_not_matching_error"), exceptions.size());
        throw (CorruptChecksumException)e;
      }
      
      throw new VerificationException(e);
    }
  }
  
  /**
   * See <a href="https://tools.ietf.org/html/draft-kunze-bagit-13#section-3">https://tools.ietf.org/html/draft-kunze-bagit-13#section-3</a><br>
   * A bag is <b>complete</b> if <br>
   * <ul>
   * <li>every element is present
   * <li>every file in the payload manifest(s) are present
   * <li>every file in the tag manifest(s) are present. Tag files not listed in a tag manifest may be present.
   * <li>every file in the data directory must be listed in at least one payload manifest
   * <li>each element must comply with the bagit spec
   * </ul>
   * 
   * @param bag the {@link Bag} object to check
   * @param ignoreHiddenFiles ignore hidden files unless explicitly listed in manifest(s)
   * 
   * @throws IOException if there was an error with the file
   * @throws MissingPayloadManifestException if there is not at least one payload manifest
   * @throws MissingBagitFileException  if there is no bagit.txt file
   * @throws MissingPayloadDirectoryException if there is no /data directory
   * @throws FileNotInPayloadDirectoryException if a manifest lists a file but it is not in the payload directory
   * @throws InterruptedException if the threads are interrupted when checking if all files are listed in manifest(s)
   * @throws MaliciousPathException if there is path that is referenced in the manifest that is outside the bag root directory
   * @throws UnsupportedAlgorithmException if the manifest uses a algorithm that isn't supported
   * @throws InvalidBagitFileFormatException if the manifest is not formatted properly 
   */
  public void isComplete(final Bag bag, final boolean ignoreHiddenFiles) throws 
    IOException, MissingPayloadManifestException, MissingBagitFileException, MissingPayloadDirectoryException, 
    FileNotInPayloadDirectoryException, InterruptedException, MaliciousPathException, UnsupportedAlgorithmException, InvalidBagitFileFormatException{
    logger.info(messages.getString("checking_bag_is_complete"), bag.getRootDir());
    
    MandatoryVerifier.checkFetchItemsExist(bag.getItemsToFetch(), bag.getRootDir());
    
    MandatoryVerifier.checkBagitFileExists(bag.getRootDir(), bag.getVersion());
    
    MandatoryVerifier.checkPayloadDirectoryExists(bag);
    
    MandatoryVerifier.checkIfAtLeastOnePayloadManifestsExist(bag.getRootDir(), bag.getVersion());
    
    manifestVerifier.verifyManifests(bag, ignoreHiddenFiles);
  }
  
  /**
   * @return the thread pool used by this verifier. It is shut down when {@link #close()} is called
   */
  public ExecutorService getExecutor() {
    return executor;
  }

  /**
   * @return the {@link ManifestVerifier} used by {@link #isComplete(Bag, boolean)}
   */
  public ManifestVerifier getManifestVerifier() {
    return manifestVerifier;
  }
}