001/*
002 * Syncany, www.syncany.org
003 * Copyright (C) 2011-2016 Philipp C. Heckel <philipp.heckel@gmail.com> 
004 *
005 * This program is free software: you can redistribute it and/or modify
006 * it under the terms of the GNU General Public License as published by
007 * the Free Software Foundation, either version 3 of the License, or
008 * (at your option) any later version.
009 *
010 * This program is distributed in the hope that it will be useful,
011 * but WITHOUT ANY WARRANTY; without even the implied warranty of
012 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
013 * GNU General Public License for more details.
014 *
015 * You should have received a copy of the GNU General Public License
016 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
017 */
018package org.syncany.chunk;
019
020import java.io.File;
021import java.io.FileOutputStream;
022import java.io.IOException;
023import java.util.List;
024
025import org.syncany.chunk.Chunker.ChunkEnumeration;
026import org.syncany.database.MultiChunkEntry.MultiChunkId;
027
028/**
029 * The Deduper implements the core deduplication algorithm used by Syncany. 
030 * 
031 * <p>The algorithm uses a {@link Chunker} to break files into individual
032 * {@link Chunk}s. These chunks are added to a {@link MultiChunk} using an implementation
033 * of a {@link MultiChunker}. Before this multichunk is written to a file, it is transformed
034 * using one or many {@link Transformer}s (can be chained). 
035 * 
036 * <p>This class does not maintain a chunk index itself. Instead, it calls a listener to
037 * lookup a chunk, and skips further chunk processing if the chunk already exists. 
038 * 
039 * <p>For a detailed description of the algorithm, please refer to chapter 5.3 of the thesis:
040 * <i>"Minimizing remote storage usage and synchronization time using deduplication and
041 * multichunking: Syncany as an example"</i>
042 * 
043 * @see <a href="http://blog.philippheckel.com/2013/05/20/minimizing-remote-storage-usage-and-synchronization-time-using-deduplication-and-multichunking-syncany-as-an-example/">Blog post: Minimizing remote storage usage and synchronization time using deduplication and multichunking: Syncany as an example</a>
044 * @author Philipp C. Heckel (philipp.heckel@gmail.com)
045 */
046public class Deduper {  
047        private Chunker chunker;
048        private MultiChunker multiChunker;
049        private Transformer transformer;
050        private long maxTotalSize;
051        private long maxNumberOfFiles;
052
053        public Deduper(Chunker chunker, MultiChunker multiChunker, Transformer transformer, long maxTotalSize, long maxNumberOfFiles) {
054                this.chunker = chunker;
055                this.multiChunker = multiChunker;
056                this.transformer = transformer;
057                this.maxTotalSize = maxTotalSize;
058                this.maxNumberOfFiles = maxNumberOfFiles;
059        }
060        
061        /**
062         * Deduplicates the given list of files according to the Syncany chunk algorithm. 
063         * 
064         * <p>A brief description of the algorithm (and further links to a detailed description)
065         * are given in the {@link Deduper}.
066         *      
067         * @param files List of files to be deduplicated (will be modified!)
068         * @param listener Listener to react of file/chunk/multichunk events, and to implement the chunk index
069         * @throws IOException If a file cannot be read or an unexpected exception occurs
070         */
071        public void deduplicate(List<File> files, DeduperListener listener) throws IOException {
072                Chunk chunk = null;
073                MultiChunk multiChunk = null;
074                long totalMultiChunkSize = 0L;
075                long totalNumFiles = 0L;
076                
077                while (!files.isEmpty()) {
078                        File file = files.remove(0);
079                        totalNumFiles++;
080                        
081                        // Filter ignored files
082                        boolean fileAccepted = listener.onFileFilter(file);
083                        
084                        if (!fileAccepted) {
085                                continue;
086                        }
087                        
088                        // Decide whether to index the contents
089                        boolean dedupContents = listener.onFileStart(file);
090
091                        if (dedupContents) {
092                                // Create chunks from file
093                                ChunkEnumeration chunksEnum = chunker.createChunks(file);
094
095                                while (chunksEnum.hasMoreElements()) {
096                                        chunk = chunksEnum.nextElement();
097
098                                        // old chunk
099                                        if (!listener.onChunk(chunk)) {
100                                                listener.onFileAddChunk(file, chunk);
101                                                continue;
102                                        }
103
104                                        // new chunk
105                                        else {                                  
106                                                // - Check if multichunk full
107                                                if (multiChunk != null && multiChunk.isFull()) {
108                                                        totalMultiChunkSize += multiChunk.getSize();
109                                                        multiChunk.close();
110                                                        listener.onMultiChunkClose(multiChunk);
111
112                                                        multiChunk = null;
113                                                }
114
115                                                // - Open new multichunk if non-existent
116                                                if (multiChunk == null) {
117                                                        MultiChunkId newMultiChunkId = listener.createNewMultiChunkId(chunk);
118                                                        File multiChunkFile = listener.getMultiChunkFile(newMultiChunkId);
119                                                        
120                                                        multiChunk = multiChunker.createMultiChunk(newMultiChunkId, 
121                                                                transformer.createOutputStream(new FileOutputStream(multiChunkFile)));
122
123                                                        listener.onMultiChunkOpen(multiChunk);
124                                                }
125
126                                                // - Add chunk data
127                                                multiChunk.write(chunk);                                                
128                                                listener.onMultiChunkWrite(multiChunk, chunk);                                          
129                                        }
130
131                                        listener.onFileAddChunk(file, chunk);                                                                           
132                                }
133
134                                // Closing file is necessary!
135                                chunksEnum.close();
136
137                        }
138
139                        if (chunk != null) {                    
140                                listener.onFileEnd(file, chunk.getFileChecksum());
141                        }
142                        else {
143                                listener.onFileEnd(file, null);
144                        }
145                        
146                        // Reset chunk (if folder after chunk, the folder would have a checksum b/c of chunk.getFileChecksum())
147                        chunk = null;
148
149                        // Check if we have reached the transaction limit
150                        if (multiChunk != null) {
151                                if (totalMultiChunkSize + multiChunk.getSize() >= maxTotalSize || totalNumFiles >= maxNumberOfFiles) {
152                                        multiChunk.close();
153                                        listener.onMultiChunkClose(multiChunk);
154                                        return;
155                                }
156                        }
157                        else if (totalMultiChunkSize >= maxTotalSize || totalNumFiles >= maxNumberOfFiles) {
158                                return;
159                        }
160                }
161
162                // Close and add last multichunk
163                if (multiChunk != null) {
164                        // Data
165                        multiChunk.close();
166                        listener.onMultiChunkClose(multiChunk);
167
168                        multiChunk = null;
169                }
170                
171                listener.onFinish();
172
173                return;
174        }       
175}