001/* 002 * Syncany, www.syncany.org 003 * Copyright (C) 2011-2016 Philipp C. Heckel <philipp.heckel@gmail.com> 004 * 005 * This program is free software: you can redistribute it and/or modify 006 * it under the terms of the GNU General Public License as published by 007 * the Free Software Foundation, either version 3 of the License, or 008 * (at your option) any later version. 009 * 010 * This program is distributed in the hope that it will be useful, 011 * but WITHOUT ANY WARRANTY; without even the implied warranty of 012 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 013 * GNU General Public License for more details. 014 * 015 * You should have received a copy of the GNU General Public License 016 * along with this program. If not, see <http://www.gnu.org/licenses/>. 017 */ 018package org.syncany.chunk; 019 020import java.io.File; 021import java.io.FileOutputStream; 022import java.io.IOException; 023import java.util.List; 024 025import org.syncany.chunk.Chunker.ChunkEnumeration; 026import org.syncany.database.MultiChunkEntry.MultiChunkId; 027 028/** 029 * The Deduper implements the core deduplication algorithm used by Syncany. 030 * 031 * <p>The algorithm uses a {@link Chunker} to break files into individual 032 * {@link Chunk}s. These chunks are added to a {@link MultiChunk} using an implementation 033 * of a {@link MultiChunker}. Before this multichunk is written to a file, it is transformed 034 * using one or many {@link Transformer}s (can be chained). 035 * 036 * <p>This class does not maintain a chunk index itself. Instead, it calls a listener to 037 * lookup a chunk, and skips further chunk processing if the chunk already exists. 038 * 039 * <p>For a detailed description of the algorithm, please refer to chapter 5.3 of the thesis: 040 * <i>"Minimizing remote storage usage and synchronization time using deduplication and 041 * multichunking: Syncany as an example"</i> 042 * 043 * @see <a href="http://blog.philippheckel.com/2013/05/20/minimizing-remote-storage-usage-and-synchronization-time-using-deduplication-and-multichunking-syncany-as-an-example/">Blog post: Minimizing remote storage usage and synchronization time using deduplication and multichunking: Syncany as an example</a> 044 * @author Philipp C. Heckel (philipp.heckel@gmail.com) 045 */ 046public class Deduper { 047 private Chunker chunker; 048 private MultiChunker multiChunker; 049 private Transformer transformer; 050 private long maxTotalSize; 051 private long maxNumberOfFiles; 052 053 public Deduper(Chunker chunker, MultiChunker multiChunker, Transformer transformer, long maxTotalSize, long maxNumberOfFiles) { 054 this.chunker = chunker; 055 this.multiChunker = multiChunker; 056 this.transformer = transformer; 057 this.maxTotalSize = maxTotalSize; 058 this.maxNumberOfFiles = maxNumberOfFiles; 059 } 060 061 /** 062 * Deduplicates the given list of files according to the Syncany chunk algorithm. 063 * 064 * <p>A brief description of the algorithm (and further links to a detailed description) 065 * are given in the {@link Deduper}. 066 * 067 * @param files List of files to be deduplicated (will be modified!) 068 * @param listener Listener to react of file/chunk/multichunk events, and to implement the chunk index 069 * @throws IOException If a file cannot be read or an unexpected exception occurs 070 */ 071 public void deduplicate(List<File> files, DeduperListener listener) throws IOException { 072 Chunk chunk = null; 073 MultiChunk multiChunk = null; 074 long totalMultiChunkSize = 0L; 075 long totalNumFiles = 0L; 076 077 while (!files.isEmpty()) { 078 File file = files.remove(0); 079 totalNumFiles++; 080 081 // Filter ignored files 082 boolean fileAccepted = listener.onFileFilter(file); 083 084 if (!fileAccepted) { 085 continue; 086 } 087 088 // Decide whether to index the contents 089 boolean dedupContents = listener.onFileStart(file); 090 091 if (dedupContents) { 092 // Create chunks from file 093 ChunkEnumeration chunksEnum = chunker.createChunks(file); 094 095 while (chunksEnum.hasMoreElements()) { 096 chunk = chunksEnum.nextElement(); 097 098 // old chunk 099 if (!listener.onChunk(chunk)) { 100 listener.onFileAddChunk(file, chunk); 101 continue; 102 } 103 104 // new chunk 105 else { 106 // - Check if multichunk full 107 if (multiChunk != null && multiChunk.isFull()) { 108 totalMultiChunkSize += multiChunk.getSize(); 109 multiChunk.close(); 110 listener.onMultiChunkClose(multiChunk); 111 112 multiChunk = null; 113 } 114 115 // - Open new multichunk if non-existent 116 if (multiChunk == null) { 117 MultiChunkId newMultiChunkId = listener.createNewMultiChunkId(chunk); 118 File multiChunkFile = listener.getMultiChunkFile(newMultiChunkId); 119 120 multiChunk = multiChunker.createMultiChunk(newMultiChunkId, 121 transformer.createOutputStream(new FileOutputStream(multiChunkFile))); 122 123 listener.onMultiChunkOpen(multiChunk); 124 } 125 126 // - Add chunk data 127 multiChunk.write(chunk); 128 listener.onMultiChunkWrite(multiChunk, chunk); 129 } 130 131 listener.onFileAddChunk(file, chunk); 132 } 133 134 // Closing file is necessary! 135 chunksEnum.close(); 136 137 } 138 139 if (chunk != null) { 140 listener.onFileEnd(file, chunk.getFileChecksum()); 141 } 142 else { 143 listener.onFileEnd(file, null); 144 } 145 146 // Reset chunk (if folder after chunk, the folder would have a checksum b/c of chunk.getFileChecksum()) 147 chunk = null; 148 149 // Check if we have reached the transaction limit 150 if (multiChunk != null) { 151 if (totalMultiChunkSize + multiChunk.getSize() >= maxTotalSize || totalNumFiles >= maxNumberOfFiles) { 152 multiChunk.close(); 153 listener.onMultiChunkClose(multiChunk); 154 return; 155 } 156 } 157 else if (totalMultiChunkSize >= maxTotalSize || totalNumFiles >= maxNumberOfFiles) { 158 return; 159 } 160 } 161 162 // Close and add last multichunk 163 if (multiChunk != null) { 164 // Data 165 multiChunk.close(); 166 listener.onMultiChunkClose(multiChunk); 167 168 multiChunk = null; 169 } 170 171 listener.onFinish(); 172 173 return; 174 } 175}