001/*
002 * Syncany, www.syncany.org
003 * Copyright (C) 2011-2016 Philipp C. Heckel <philipp.heckel@gmail.com>
004 *
005 * This program is free software: you can redistribute it and/or modify
006 * it under the terms of the GNU General Public License as published by
007 * the Free Software Foundation, either version 3 of the License, or
008 * (at your option) any later version.
009 *
010 * This program is distributed in the hope that it will be useful,
011 * but WITHOUT ANY WARRANTY; without even the implied warranty of
012 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
013 * GNU General Public License for more details.
014 *
015 * You should have received a copy of the GNU General Public License
016 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
017 */
018package org.syncany.chunk;
019
020import java.io.File;
021import java.io.FileInputStream;
022import java.io.IOException;
023import java.io.InputStream;
024import java.security.MessageDigest;
025import java.util.logging.Level;
026import java.util.logging.Logger;
027
028/**
029 * The TTTD chunker is an implementation of the Two Threshold Two Divisor (TTTD)
030 * chunking method based on the paper of Kave Eshghi and Hsiu Khuern Tang, 2005.
031 *
032 * <p>The class implements a content-based {@link Chunker}, i.e. it determines breakpoints
033 * on the content rather than on the offset. It uses a fingerprinting algorithm to
034 * calculate window fingerprints and determine a breakpoint. The algorithm given in the
035 * constructor is instantiated to an implementation of a {@link Fingerprinter}.
036 *
037 * <p>The TTTD chunking method makes sure that it does not produce chunks smaller
038 * than a certain threshold. To do so, it ignores chunk boundaries until a minimum
039 * size is reached. Even though this negatively affects the duplicate detection
040 * (because bigger chunks are created), chunk boundaries are still "natural" --
041 * because they are based on the underlying data.
042 *
043 * <p>To handle chunks that exceed a certain maximum size, TTTD applies two techniques.
044 * It defines the two divisors <i>D</i> (regular divisor) and <i>D'</i> (backup divisor)
045 * with <i>D</i> &gt; <i>D'</i>. Because <i>D'</i> is smaller than <i>D</i>, it is more
046 * likely to find a breakpoint.
047 *
048 * <p>If <i>D</i> does not find a chunk boundary before the maximum chunk size is reached,
049 * the backup breakpoint found by <i>D'</i> is used. If <i>D</i> also does not find any
050 * breakpoints, TTTD simply cuts the chunk at the maximum chunk size. TTTD hence guarantees
051 * to emit chunks with a minimum and maximum size.
052 *
053 * @author Philipp C. Heckel (philipp.heckel@gmail.com)
054 * @see <a href="http://www.hpl.hp.com/techreports/2005/HPL-2005-30R1.html">Original TTTD paper: A framework for analyzing and improving content-based chunking algorithms (2005, Kave Eshghi and Hsiu Khuern Tang)</a>
055 */
056public class TttdChunker extends Chunker {
057        private static final Logger logger = Logger.getLogger(TttdChunker.class.getSimpleName());
058
059        public static final int DEFAULT_WINDOW_SIZE = 48; // like LBFS
060        public static final String DEFAULT_DIGEST_ALG = "SHA1";
061        public static final String DEFAULT_FINGERPRINT_ALG = "Adler32";
062
063        private int Tmin;
064        private int Tmax;
065        private int D;
066        private int Ddash;
067        private int windowSize;
068        private String checksumAlgorithm;
069        private String fingerprintAlgorithm;
070        private String name;
071
072        public TttdChunker(int Tmin, int Tmax, int D, int Ddash, int windowSize) {
073                this(Tmin, Tmax, D, Ddash, windowSize, DEFAULT_DIGEST_ALG, DEFAULT_FINGERPRINT_ALG);
074        }
075
076        public TttdChunker(int Tmin, int Tmax, int D, int Ddash, int windowSize, String digestAlg) {
077                this(Tmin, Tmax, D, Ddash, windowSize, digestAlg, DEFAULT_FINGERPRINT_ALG);
078        }
079
080        public TttdChunker(int avgChunkSize) {
081                this(avgChunkSize, DEFAULT_WINDOW_SIZE, DEFAULT_DIGEST_ALG, DEFAULT_FINGERPRINT_ALG);
082        }
083
084        /**
085         * Infer the optimal values for avgChunkSize from the orginal paper's optimal (measured) values.
086         * LBFS: avg. chunk size = 1015 bytes --&gt; Tmin = 460, Tmax = 2800, D = 540, Ddash = 270
087         */
088        public TttdChunker(int avgChunkSize, int windowSize, String digestAlg, String fingerprintAlg) {
089                this(
090                                /* Tmin */(int) Math.round(460.0 * avgChunkSize / 1015.0),
091                                /* Tmax */(int) Math.round(2800.0 * avgChunkSize / 1015.0),
092                                /*   D  */(int) Math.round(540.0 * avgChunkSize / 1015.0),
093                                /*   D  */(int) Math.round(270.0 * avgChunkSize / 1015.0),
094                                /* rest */windowSize, digestAlg, fingerprintAlg, "TTTD-" + avgChunkSize + "-" + digestAlg + "-" + fingerprintAlg);
095        }
096
097        public TttdChunker(int Tmin, int Tmax, int D, int Ddash, int windowSize, String digestAlg, String fingerprintAlg) {
098                this(Tmin, Tmax, D, Ddash, windowSize, digestAlg, fingerprintAlg, "TTTD-" + Tmin + "-" + Tmax + "-" + D + "-" + Ddash + "-" + digestAlg + "-"
099                                + fingerprintAlg);
100        }
101
102        private TttdChunker(int Tmin, int Tmax, int D, int Ddash, int windowSize, String digestAlg, String fingerprintAlg, String name) {
103                this.Tmin = Tmin;
104                this.Tmax = Tmax;
105                this.D = D;
106                this.Ddash = Ddash;
107                this.windowSize = windowSize;
108                this.checksumAlgorithm = digestAlg;
109                this.fingerprintAlgorithm = fingerprintAlg;
110                this.name = name;
111
112                if (windowSize > Tmin) {
113                        throw new IllegalArgumentException("Window size must be smaller than Tmin.");
114                }
115        }
116
117        @Override
118        public ChunkEnumeration createChunks(File file) throws IOException {
119                return new TTTDEnumeration(new FileInputStream(file));
120        }
121
122        @Override
123        public String getChecksumAlgorithm() {
124                return checksumAlgorithm;
125        }
126
127        @Override
128        public String toString() {
129                return name;
130        }
131
132        public class TTTDEnumeration implements ChunkEnumeration {
133                private InputStream in;
134                private boolean closed;
135                private byte[] c;
136                private int clen;
137                private int cpos;
138
139                private MessageDigest chunkDigest;
140                private MessageDigest fileDigest;
141                private Fingerprinter fingerprinter;
142
143                public TTTDEnumeration(InputStream in) throws IOException {
144                        this.in = in;
145                        this.closed = false;
146                        this.c = new byte[8192];
147                        this.clen = -1;
148                        this.cpos = -1;
149
150                        try {
151                                fingerprinter = Fingerprinter.getInstance(fingerprintAlgorithm);
152                                chunkDigest = MessageDigest.getInstance(checksumAlgorithm);
153                                fileDigest = MessageDigest.getInstance(checksumAlgorithm);
154
155                                fileDigest.reset();
156                        }
157                        catch (Exception e) {
158                                throw new RuntimeException(e);
159                        }
160                }
161
162                @Override
163                public boolean hasMoreElements() {
164                        return !closed;
165                }
166
167                @Override
168                public Chunk nextElement() {
169                        if (closed) {
170                                return null;
171                        }
172
173                        chunkDigest.reset();
174                        fingerprinter.reset();
175
176                        try {
177                                int backupBreak = 0;
178                                int breakpoint = -1;
179
180                                byte[] buf = new byte[Tmax];
181                                int bufpos = -1;
182
183                                while (bufpos < buf.length - 1) {
184                                        if (cpos == -1 || cpos == clen - 1) {
185                                                cpos = -1;
186                                                clen = readFromInputStreamFixed(c, in);//in.read(c);
187
188                                                if (clen == -1) {
189                                                        break;
190                                                }
191
192                                                fileDigest.update(c, 0, clen);
193                                        }
194
195                                        bufpos++;
196                                        cpos++;
197                                        buf[bufpos] = c[cpos];
198
199                                        if (bufpos < Tmin) {
200                                                continue;
201                                        }
202                                        else if (bufpos == Tmin) {
203                                                fingerprinter.check(buf, bufpos - windowSize, windowSize);
204                                        }
205                                        else {
206                                                fingerprinter.roll(buf[bufpos]);
207                                        }
208
209                                        int hash = fingerprinter.getValue();
210
211                                        // The value of r (right side) plays no role! #39  
212                                        if ((hash % Ddash) == Ddash - 1) {
213                                                backupBreak = bufpos;
214                                        }
215
216                                        if ((hash % D) == D - 1) {
217                                                breakpoint = bufpos;
218                                                break;
219                                        }
220
221                                        if (bufpos < Tmax) {
222                                                continue;
223                                        }
224
225                                        if (backupBreak != 0) {
226                                                breakpoint = backupBreak;
227                                                break;
228                                        }
229                                        else {
230                                                breakpoint = bufpos;
231                                                break;
232                                        }
233                                }
234
235                                // Close if this was the last bytes
236                                if (clen == -1) {
237                                        in.close();
238                                        closed = true;
239                                }
240
241                                // EOF as breakpoint
242                                if (breakpoint == -1) {
243                                        breakpoint = bufpos;
244                                }
245
246                                // Increase breakpoint
247                                breakpoint++;
248
249                                // Create chunk
250                                chunkDigest.update(buf, 0, breakpoint);
251
252                                byte[] chunkChecksum = chunkDigest.digest();
253                                byte[] chunkContents = buf;
254                                int chunkSize = breakpoint;
255                                byte[] fileChecksum = (clen == -1) ? fileDigest.digest() : null;
256
257                                return new Chunk(chunkChecksum, chunkContents, chunkSize, fileChecksum);
258                        }
259                        catch (IOException ex) {
260                                logger.log(Level.SEVERE, "Error while retrieving next chunk.", ex);
261                                return null;
262                        }
263                }
264
265                @Override
266                public void close() {
267                        try {
268                                in.close();
269                        }
270                        catch (IOException e) {
271                                logger.log(Level.INFO, "Error while closing", e);
272                        }
273                }
274
275                /**
276                 * Fixes the read errors occurring with Cipher streams in the standard
277                 * Java read implementation.
278                 */
279                private int readFromInputStreamFixed(byte[] readToBuffer, InputStream inputStream) throws IOException {
280                        int bytesRead = 0;
281
282                        while (bytesRead < readToBuffer.length) {
283                                int byteRead = inputStream.read();
284
285                                if (byteRead == -1) {
286                                        return (bytesRead != 0) ? bytesRead : -1;
287                                }
288
289                                readToBuffer[bytesRead] = (byte) byteRead;
290                                bytesRead++;
291                        }
292
293                        return (bytesRead != 0) ? bytesRead : -1;
294                }
295        }
296}