// ManifestVerifier.java
/*
* Copyright (C) 2023 DANS - Data Archiving and Networked Services (info@dans.knaw.nl)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package nl.knaw.dans.bagit.verify;
import java.io.IOException;
import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.ResourceBundle;
import java.util.Set;
import java.util.concurrent.ConcurrentSkipListSet;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import nl.knaw.dans.bagit.domain.Bag;
import nl.knaw.dans.bagit.domain.FetchItem;
import nl.knaw.dans.bagit.exceptions.FileNotInPayloadDirectoryException;
import nl.knaw.dans.bagit.exceptions.InvalidBagitFileFormatException;
import nl.knaw.dans.bagit.exceptions.MaliciousPathException;
import nl.knaw.dans.bagit.exceptions.UnsupportedAlgorithmException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.slf4j.helpers.MessageFormatter;
import nl.knaw.dans.bagit.domain.Manifest;
import nl.knaw.dans.bagit.domain.Version;
import nl.knaw.dans.bagit.hash.BagitAlgorithmNameToSupportedAlgorithmMapping;
import nl.knaw.dans.bagit.hash.StandardBagitAlgorithmNameToSupportedAlgorithmMapping;
import nl.knaw.dans.bagit.util.PathUtils;
/**
 * Responsible for all things related to the manifest during verification.
 * <p>
 * File-existence checks are dispatched to the {@link ExecutorService} held by this instance.
 * {@link #close()} shuts that executor down, including when it was supplied by the caller.
 */
public class ManifestVerifier implements AutoCloseable {
  private static final Logger logger = LoggerFactory.getLogger(ManifestVerifier.class);
  private static final ResourceBundle messages = ResourceBundle.getBundle("MessageBundle");

  private final transient BagitAlgorithmNameToSupportedAlgorithmMapping nameMapping;
  private final transient ExecutorService executor;

  /**
   * Create a ManifestVerifier using a fixed-size thread pool (one thread per available processor) and the
   * {@link StandardBagitAlgorithmNameToSupportedAlgorithmMapping} mapping
   */
  public ManifestVerifier() {
    this(new StandardBagitAlgorithmNameToSupportedAlgorithmMapping(), newDefaultExecutor());
  }

  /**
   * Create a ManifestVerifier using a fixed-size thread pool (one thread per available processor) and a
   * custom mapping
   *
   * @param nameMapping the mapping between BagIt algorithm name and the java supported algorithm
   */
  public ManifestVerifier(final BagitAlgorithmNameToSupportedAlgorithmMapping nameMapping) {
    this(nameMapping, newDefaultExecutor());
  }

  /**
   * Create a ManifestVerifier using a custom thread pool and the
   * {@link StandardBagitAlgorithmNameToSupportedAlgorithmMapping} mapping
   *
   * @param executor the thread pool to use when doing work; it is shut down by {@link #close()}
   */
  public ManifestVerifier(final ExecutorService executor) {
    this(new StandardBagitAlgorithmNameToSupportedAlgorithmMapping(), executor);
  }

  /**
   * Create a ManifestVerifier using a custom thread pool and a custom mapping
   *
   * @param nameMapping the mapping between BagIt algorithm name and the java supported algorithm
   * @param executor the thread pool to use when doing work; it is shut down by {@link #close()}
   */
  public ManifestVerifier(final BagitAlgorithmNameToSupportedAlgorithmMapping nameMapping, final ExecutorService executor) {
    this.nameMapping = nameMapping;
    this.executor = executor;
  }

  /*
   * Thread pool used by the constructors that do not receive an executor:
   * a fixed pool sized to the number of available processors.
   */
  private static ExecutorService newDefaultExecutor() {
    return Executors.newFixedThreadPool(Runtime.getRuntime().availableProcessors());
  }

  /**
   * Shut down the thread pool used by this verifier so the resource isn't leaked.
   *
   * @throws SecurityException if the executor refuses the shutdown request
   */
  @Override
  public void close() throws SecurityException {
    executor.shutdown();
  }

  /**
   * Verify that all the files in the payload directory are listed in the payload manifest and
   * all files listed in all manifests exist. Equivalent to calling
   * {@link #verifyManifests(Bag, boolean, boolean)} with {@code holey} set to {@code false}.
   *
   * @param bag the bag to check
   * @param ignoreHiddenFiles to ignore hidden files unless they are specifically listed in a manifest
   *
   * @throws IOException if there is a problem reading a file
   * @throws MaliciousPathException the path in the manifest was specifically crafted to cause harm
   * @throws UnsupportedAlgorithmException if the algorithm used for the manifest is unsupported
   * @throws InvalidBagitFileFormatException if any of the manifests don't conform to the bagit specification
   * @throws FileNotInPayloadDirectoryException if a file is listed in a manifest but doesn't exist
   * @throws InterruptedException if a thread is interrupted while doing work
   */
  public void verifyManifests(final Bag bag, final boolean ignoreHiddenFiles)
      throws IOException, MaliciousPathException, UnsupportedAlgorithmException,
      InvalidBagitFileFormatException, FileNotInPayloadDirectoryException, InterruptedException {
    verifyManifests(bag, ignoreHiddenFiles, false);
  }

  /**
   * Verify that all the files in the payload directory are listed in the payload manifest and
   * all files listed in all manifests exist.
   * <p>
   * For bags older than version 1.0 every file in the payload directory has to be listed in at least one
   * manifest; for all other versions every file has to be listed in all payload manifests.
   *
   * @param bag the bag to check
   * @param ignoreHiddenFiles to ignore hidden files unless they are specifically listed in a manifest
   * @param holey if {@code true}, payload files that appear in the bag's items to fetch are not required
   *        to exist on disk; files listed in tag manifests must always exist
   *
   * @throws IOException if there is a problem reading a file
   * @throws MaliciousPathException the path in the manifest was specifically crafted to cause harm
   * @throws UnsupportedAlgorithmException if the algorithm used for the manifest is unsupported
   * @throws InvalidBagitFileFormatException if any of the manifests don't conform to the bagit specification
   * @throws FileNotInPayloadDirectoryException if a file is listed in a manifest but doesn't exist
   * @throws InterruptedException if a thread is interrupted while doing work
   */
  public void verifyManifests(final Bag bag, final boolean ignoreHiddenFiles, final boolean holey)
      throws IOException, MaliciousPathException, UnsupportedAlgorithmException,
      InvalidBagitFileFormatException, FileNotInPayloadDirectoryException, InterruptedException {
    // Reject unsupported algorithms before touching the file system.
    checkAlgorithms(bag.getPayLoadManifests());
    checkAlgorithms(bag.getTagManifests());

    final Set<Path> payloadFiles = getFilesListedInManifests(bag.getPayLoadManifests());
    final Set<Path> tagFiles = getFilesListedInManifests(bag.getTagManifests());

    // Only payload files may be left out of a holey bag; tag files are always required on disk.
    checkAllFilesListedInManifestExist(payloadFiles, holey, bag);
    checkAllFilesListedInManifestExist(tagFiles, false, bag);

    final Path payloadDir = PathUtils.getDataDir(bag);
    if (bag.getVersion().isOlder(new Version(1, 0))) {
      final Set<Path> allFilesListedInManifests = new HashSet<>(payloadFiles);
      allFilesListedInManifests.addAll(tagFiles);
      checkAllFilesInPayloadDirAreListedInAtLeastOneAManifest(allFilesListedInManifests, payloadDir, ignoreHiddenFiles);
    } else {
      checkAllFilesInPayloadDirAreListedInAllManifests(bag.getPayLoadManifests(), payloadDir, ignoreHiddenFiles);
    }
  }

  /*
   * Make sure the name mapping knows the algorithm of every given manifest
   */
  private void checkAlgorithms(final Set<Manifest> manifests) throws UnsupportedAlgorithmException {
    for (final Manifest manifest : manifests) {
      final String bagitName = manifest.getAlgorithm().getBagitName();
      if (nameMapping.getSupportedAlgorithm(bagitName) == null) {
        throw new UnsupportedAlgorithmException(messages.getString("unsupported_algorithm_error"), bagitName, null);
      }
    }
  }

  /*
   * get all the files listed in the given manifests (used for both payload and tag manifests)
   */
  private static Set<Path> getFilesListedInManifests(final Set<Manifest> manifests) {
    logger.debug(messages.getString("all_files_in_manifests"));
    final Set<Path> filesListedInManifests = new HashSet<>();
    for (final Manifest manifest : manifests) {
      filesListedInManifests.addAll(manifest.getFileToChecksumMap().keySet());
    }
    return filesListedInManifests;
  }

  /*
   * Make sure all the listed files actually exist. When holey is true, files that appear in the bag's
   * items to fetch are skipped; the bag is only consulted in that case.
   */
  @SuppressWarnings("PMD.AvoidInstantiatingObjectsInLoops")
  private void checkAllFilesListedInManifestExist(final Set<Path> files, final boolean holey, final Bag bag)
      throws FileNotInPayloadDirectoryException, InterruptedException {
    // One count per listed file: either counted down here (fetch item) or handed to the submitted task.
    final CountDownLatch latch = new CountDownLatch(files.size());
    final Set<Path> missingFiles = new ConcurrentSkipListSet<>();

    final Map<Path, URL> fetchUrls = new HashMap<>();
    if (holey) {
      for (final FetchItem item : bag.getItemsToFetch()) {
        fetchUrls.put(item.path, item.url);
      }
    }

    logger.info(messages.getString("check_all_files_in_manifests_exist"));
    for (final Path file : files) {
      if (holey && fetchUrls.containsKey(file)) {
        // Not actually checking that the file can be downloaded. That will be done when calculating the checksums later on.
        latch.countDown();
      } else {
        executor.execute(new CheckIfFileExistsTask(file, missingFiles, latch));
      }
    }

    // Block until every file has been accounted for.
    latch.await();

    if (!missingFiles.isEmpty()) {
      final String formattedMessage = messages.getString("missing_payload_files_error");
      throw new FileNotInPayloadDirectoryException(MessageFormatter.format(formattedMessage, missingFiles).getMessage());
    }
  }

  /*
   * Make sure all files in the directory are in at least 1 manifest.
   * Nothing is checked when the payload directory does not exist.
   */
  private static void checkAllFilesInPayloadDirAreListedInAtLeastOneAManifest(final Set<Path> filesListedInManifests,
      final Path payloadDir, final boolean ignoreHiddenFiles) throws IOException {
    logger.debug(messages.getString("checking_file_in_at_least_one_manifest"), payloadDir);
    if (Files.exists(payloadDir)) {
      Files.walkFileTree(payloadDir,
          new PayloadFileExistsInAtLeastOneManifestVistor(filesListedInManifests, ignoreHiddenFiles));
    }
  }

  /*
   * as per the bagit-spec 1.0+ all files have to be listed in all manifests.
   * Nothing is checked when the payload directory does not exist.
   */
  private static void checkAllFilesInPayloadDirAreListedInAllManifests(final Set<Manifest> payLoadManifests,
      final Path payloadDir, final boolean ignoreHiddenFiles) throws IOException {
    logger.debug(messages.getString("checking_file_in_all_manifests"), payloadDir);
    if (Files.exists(payloadDir)) {
      Files.walkFileTree(payloadDir, new PayloadFileExistsInAllManifestsVistor(payLoadManifests, ignoreHiddenFiles));
    }
  }
}