ManifestChecker.java

/*
 * Copyright (C) 2023 DANS - Data Archiving and Networked Services (info@dans.knaw.nl)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package nl.knaw.dans.bagit.conformance;

import java.io.BufferedReader;
import java.io.IOException;
import java.nio.charset.Charset;
import java.nio.file.DirectoryStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.text.Normalizer;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;
import java.util.ResourceBundle;
import java.util.Set;

import nl.knaw.dans.bagit.domain.Manifest;
import nl.knaw.dans.bagit.exceptions.InvalidBagitFileFormatException;
import nl.knaw.dans.bagit.exceptions.MaliciousPathException;
import nl.knaw.dans.bagit.exceptions.UnsupportedAlgorithmException;
import nl.knaw.dans.bagit.hash.StandardBagitAlgorithmNameToSupportedAlgorithmMapping;
import nl.knaw.dans.bagit.reader.ManifestReader;
import nl.knaw.dans.bagit.util.PathUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.slf4j.helpers.MessageFormatter;

import nl.knaw.dans.bagit.domain.Version;

/**
 * Part of the BagIt conformance suite. 
 * This checker checks for various problems related to the manifests in a bag.
 */
//TODO refactor to remove PMD warnings!
@SuppressWarnings({"PMD.UseLocaleWithCaseConversions", "PMD.TooManyMethods", "PMD.GodClass"})
public final class ManifestChecker {
  private static final Logger logger = LoggerFactory.getLogger(ManifestChecker.class);
  private static final ResourceBundle messages = ResourceBundle.getBundle("MessageBundle");
  
  private static final String THUMBS_DB_FILE = "[Tt][Hh][Uu][Mm][Bb][Ss]\\.[Dd][Bb]";
  private static final String DS_STORE_FILE = "\\.[Dd][Ss]_[Ss][Tt][Oo][Rr][Ee]";
  private static final String SPOTLIGHT_FILE = "\\.[Ss][Pp][Oo][Tt][Ll][Ii][Gg][Hh][Tt]-[Vv]100";
  private static final String TRASHES_FILE = "\\.(_.)?[Tt][Rr][Aa][Ss][Hh][Ee][Ss]";
  private static final String FS_EVENTS_FILE = "\\.[Ff][Ss][Ee][Vv][Ee][Nn][Tt][Ss][Dd]";
  private static final String OS_FILES_REGEX = ".*data/(" + THUMBS_DB_FILE + "|" + DS_STORE_FILE + "|" + SPOTLIGHT_FILE + "|" + TRASHES_FILE + "|" + FS_EVENTS_FILE + ")";
  private static final Version VERSION_1_0 = new Version(1,0);
  
  private ManifestChecker(){
    //intentionally left empty
  }
  
  /**
   * Check for all the manifest specific potential problems
   * 
   * @param version the version of the bag we are checking
   * @param bagitDir the directory where the manifests are stored
   * @param encoding the encoding of the manifests
   * @param warnings the set of warnings that will be appended to while checking
   * @param warningsToIgnore the set of warnings to ignore
   * 
   * @throws IOException if there is a problem reading a file (because it doesn't exist) 
   * @throws InvalidBagitFileFormatException if one (or more) of the files does not match the formatting as specified in the specification
   * @throws MaliciousPathException if someone crafted the bag to specifically try and write outside the bag directory
   * @throws UnsupportedAlgorithmException if a manifest uses an algorithm that the computer doesn't know how to use
   */
  public static void checkManifests(final Version version, final Path bagitDir, final Charset encoding, final Set<BagitWarning> warnings, 
      final Collection<BagitWarning> warningsToIgnore) throws IOException, InvalidBagitFileFormatException, MaliciousPathException, UnsupportedAlgorithmException{
        
    boolean missingTagManifest = true;
    final List<Path> payloadManifests = new ArrayList<>();
    final List<Path> tagManifests = new ArrayList<>();
    try(final DirectoryStream<Path> files = Files.newDirectoryStream(bagitDir)){
      for(final Path file : files){
        missingTagManifest = missingTagManifest && checkManifest(file, payloadManifests, tagManifests, encoding, warnings, warningsToIgnore);
      }
    }
    
    if(!warnings.contains(BagitWarning.MANIFEST_SETS_DIFFER)){
      checkManifestSets(version, tagManifests, payloadManifests, warnings, encoding);
    }
    
    if(!warningsToIgnore.contains(BagitWarning.MISSING_TAG_MANIFEST) && missingTagManifest){
      logger.warn(messages.getString("bag_missing_tag_manifest_warning"), bagitDir);
      warnings.add(BagitWarning.MISSING_TAG_MANIFEST);
    }
  }
  
  private static boolean checkManifest(final Path file, final List<Path> payloadManifests, final List<Path> tagManifests, 
      final Charset encoding, final Set<BagitWarning> warnings, 
      final Collection<BagitWarning> warningsToIgnore) throws IOException, InvalidBagitFileFormatException{
    boolean missingTagManifest = true;
    final String filename = PathUtils.getFilename(file);
    if(filename.contains("manifest-")){
      if(filename.startsWith("manifest-")){
        payloadManifests.add(file);
        checkManifestPayload(file, encoding, warnings, warningsToIgnore, true);
      }
      else{
        tagManifests.add(file);
        checkManifestPayload(file, encoding, warnings, warningsToIgnore, false);
        missingTagManifest = false;
      }
      
      final String algorithm = filename.split("[-\\.]")[1];
      checkAlgorthm(algorithm, warnings, warningsToIgnore);
    }
    
    return missingTagManifest;
  }
  
  /*
   * Check for a "bag within a bag", relative paths, and OS specific files in the manifests
   */
  private static void checkManifestPayload(final Path manifestFile, final Charset encoding, final Set<BagitWarning> warnings, 
      final Collection<BagitWarning> warningsToIgnore, final boolean isPayloadManifest) 
          throws IOException, InvalidBagitFileFormatException{
    
    try(final BufferedReader reader = Files.newBufferedReader(manifestFile, encoding)){
      final Set<String> paths = new HashSet<>();
      
      String line = reader.readLine();
      while(line != null){
        String path = parsePath(line);
        
        path = checkForManifestCreatedWithMD5SumTools(path, warnings, warningsToIgnore);
        
        checkForDifferentCase(path, paths, manifestFile, warnings, warningsToIgnore);
        paths.add(path.toLowerCase());
        
        if(encoding.name().startsWith("UTF")){
          checkNormalization(path, manifestFile.getParent(), warnings, warningsToIgnore);
        }
        checkForBagWithinBag(line, warnings, warningsToIgnore, isPayloadManifest);
        checkForRelativePaths(line, warnings, warningsToIgnore, manifestFile);
        checkForOSSpecificFiles(line, warnings, warningsToIgnore, manifestFile);
        
        line = reader.readLine();
      }
    }
  }
  
  /*
   * Check to make sure it conforms to <hash> <path>
   */
  static String parsePath(final String line) throws InvalidBagitFileFormatException{
    final String[] parts = line.split("\\s+", 2);
    if(parts.length < 2){
      final String formattedMessage = messages.getString("manifest_line_violated_spec_error");
      throw new InvalidBagitFileFormatException(MessageFormatter.format(formattedMessage, line).getMessage());
    }
    
    return parts[1];
  }
  
  /*
   * We allow for MD5sum tools for compatibility but it is not recommended
   */
  private static String checkForManifestCreatedWithMD5SumTools(final String path, final Set<BagitWarning> warnings, final Collection<BagitWarning> warningsToIgnore){
    String fixedPath = path;
    final boolean startsWithStar = path.charAt(0) == '*';
    
    if(startsWithStar){
      fixedPath = path.substring(1);
    }
    
    if(!warningsToIgnore.contains(BagitWarning.MD5SUM_TOOL_GENERATED_MANIFEST) && startsWithStar){
      logger.warn(messages.getString("md5sum_generated_line_warning"), path);
      warnings.add(BagitWarning.MD5SUM_TOOL_GENERATED_MANIFEST);
    }
    
    return fixedPath;
  }
  
  /*
   * Check that the same line doesn't already exist in the set of paths
   */
  private static void checkForDifferentCase(final String path, final Set<String> paths, final Path manifestFile, 
      final Set<BagitWarning> warnings, final Collection<BagitWarning> warningsToIgnore){
    if(!warningsToIgnore.contains(BagitWarning.DIFFERENT_CASE) && paths.contains(path.toLowerCase())){
      logger.warn(messages.getString("different_case_warning"), manifestFile, path);
      warnings.add(BagitWarning.DIFFERENT_CASE);
    }
  }
  
  /*
   * Check that the file specified has not changed its normalization (i.e. have the bytes changed but it still looks the same?)
   */
  private static void checkNormalization(final String path, final Path rootDir, final Set<BagitWarning> warnings, final Collection<BagitWarning> warningsToIgnore) throws IOException{
    if(!warningsToIgnore.contains(BagitWarning.DIFFERENT_NORMALIZATION)){
      
      final Path fileToCheck = rootDir.resolve(path).normalize();
      final Path dirToCheck = fileToCheck.getParent();
      if(dirToCheck == null){ 
        final String formattedMessage = messages.getString("cannot_access_parent_path_error");
        throw new IOException(MessageFormatter.format(formattedMessage, fileToCheck).getMessage()); //to satisfy findbugs
      }
      final String normalizedFileToCheck = normalizePathToNFD(fileToCheck);
      
      try(final DirectoryStream<Path> files = Files.newDirectoryStream(dirToCheck)){
        for(final Path file : files){
          final String normalizedFile = normalizePathToNFD(file);
          
          if(!file.equals(fileToCheck) && normalizedFileToCheck.equals(normalizedFile)){
            logger.warn(messages.getString("different_normalization_in_manifest_warning"), fileToCheck);
            warnings.add(BagitWarning.DIFFERENT_NORMALIZATION);
          }
        }
      }
    }
  }
  
  /*
   * Normalize to Canonical decomposition.
   */
  static String normalizePathToNFD(final Path path){
    return Normalizer.normalize(path.toString(), Normalizer.Form.NFD);
  }
  
  /*
   * check for a bag within a bag
   */
  private static void checkForBagWithinBag(final String line, final Set<BagitWarning> warnings, final Collection<BagitWarning> warningsToIgnore, final boolean isPayloadManifest){
    if(!warningsToIgnore.contains(BagitWarning.BAG_WITHIN_A_BAG) && isPayloadManifest && line.contains("manifest-")){
      logger.warn(messages.getString("bag_within_bag_warning"));
      warnings.add(BagitWarning.BAG_WITHIN_A_BAG);
    }
  }
  
  /*
   * Check for relative paths (i.e. ./) in the manifest
   */
  private static void checkForRelativePaths(final String line, final Set<BagitWarning> warnings, final Collection<BagitWarning> warningsToIgnore, final Path manifestFile){
    if(!warningsToIgnore.contains(BagitWarning.LEADING_DOT_SLASH) && line.contains("./")){
      logger.warn(messages.getString("leading_dot_slash_warning"), manifestFile, line);
      warnings.add(BagitWarning.LEADING_DOT_SLASH);
    }
  }
  
  /*
   * like .DS_Store or Thumbs.db
   */
  private static void checkForOSSpecificFiles(final String line, final Set<BagitWarning> warnings, final Collection<BagitWarning> warningsToIgnore, final Path manifestFile){
    if(!warningsToIgnore.contains(BagitWarning.OS_SPECIFIC_FILES) && line.matches(OS_FILES_REGEX)){
      logger.warn(messages.getString("os_specific_files_warning"), manifestFile, line);
      warnings.add(BagitWarning.OS_SPECIFIC_FILES);
    }
  }
  
  /*
   * Check for anything weaker than SHA-512
   */
  static void checkAlgorthm(final String algorithm, final Set<BagitWarning> warnings, final Collection<BagitWarning> warningsToIgnore){
    final String upperCaseAlg = algorithm.toUpperCase();
    if(!warningsToIgnore.contains(BagitWarning.WEAK_CHECKSUM_ALGORITHM) && 
        (upperCaseAlg.startsWith("MD") || upperCaseAlg.matches("SHA(1|224|256|384)?"))){
      logger.warn(messages.getString("weak_algorithm_warning"), algorithm);
      warnings.add(BagitWarning.WEAK_CHECKSUM_ALGORITHM);
    }
    
    else if(!warningsToIgnore.contains(BagitWarning.NON_STANDARD_ALGORITHM) && !"SHA512".equals(upperCaseAlg)){
      logger.warn(messages.getString("non_standard_algorithm_warning"), algorithm);
      warnings.add(BagitWarning.NON_STANDARD_ALGORITHM);
    }
  }
  
  static void checkManifestSets(final Version version, final List<Path> tagManifests, final List<Path> payloadManifests, 
      final Set<BagitWarning> warnings, final Charset encoding) 
          throws IOException, MaliciousPathException, UnsupportedAlgorithmException, InvalidBagitFileFormatException{
  //edge case, for version 1.0+ all tag manifests SHOULD list the same set of files
    if(tagManifests.size() > 1 && VERSION_1_0.isSameOrOlder(version)){
      checkManifestsListSameSetOfFiles(warnings, tagManifests, encoding);
    }
    
    //edge case, for version 1.0+ all payload manifests SHOULD list the same set of files
    if(payloadManifests.size() > 1 && VERSION_1_0.isSameOrOlder(version)){
      checkManifestsListSameSetOfFiles(warnings, payloadManifests, encoding);
    }
  }
  
  //starting with version 1.0 all manifest types (tag, payload) should list the same set of files
  @SuppressWarnings("PMD.EmptyCatchBlock")
  static void checkManifestsListSameSetOfFiles(final Set<BagitWarning> warnings, final List<Path> manifestPaths, final Charset charset) throws IOException, MaliciousPathException, UnsupportedAlgorithmException, InvalidBagitFileFormatException{
    final StandardBagitAlgorithmNameToSupportedAlgorithmMapping nameMapping = new StandardBagitAlgorithmNameToSupportedAlgorithmMapping();
    
    Manifest compareToManifest = null;
    Path compareToManifestPath = null;
    for (final Path manifestPath : manifestPaths) {
      try {
        final Manifest manifest = ManifestReader.readManifest(nameMapping, manifestPath, manifestPath.getParent(), charset);
        if(compareToManifest == null) {
          compareToManifestPath = manifestPath;
          compareToManifest = manifest;
          continue;
        }
        
        if(!compareToManifest.getFileToChecksumMap().keySet().equals(manifest.getFileToChecksumMap().keySet())) {
          logger.warn(messages.getString("manifest_fileset_differ"), compareToManifestPath, manifestPath);
          warnings.add(BagitWarning.MANIFEST_SETS_DIFFER);
        }
      }
      catch(UnsupportedAlgorithmException e) {
        //ignore an unsupported algorithm as it is caught in checkAlgorthm()
      }
    }
  }

  //for unit test only
  static String getOsFilesRegex() {
    return OS_FILES_REGEX;
  }

}