001/* 002 * Syncany, www.syncany.org 003 * Copyright (C) 2011-2016 Philipp C. Heckel <philipp.heckel@gmail.com> 004 * 005 * This program is free software: you can redistribute it and/or modify 006 * it under the terms of the GNU General Public License as published by 007 * the Free Software Foundation, either version 3 of the License, or 008 * (at your option) any later version. 009 * 010 * This program is distributed in the hope that it will be useful, 011 * but WITHOUT ANY WARRANTY; without even the implied warranty of 012 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 013 * GNU General Public License for more details. 014 * 015 * You should have received a copy of the GNU General Public License 016 * along with this program. If not, see <http://www.gnu.org/licenses/>. 017 */ 018package org.syncany.chunk; 019 020import java.io.File; 021import java.io.FileInputStream; 022import java.io.IOException; 023import java.io.InputStream; 024import java.security.MessageDigest; 025import java.util.logging.Level; 026import java.util.logging.Logger; 027 028/** 029 * The TTTD chunker is an implementation of the Two Threshold Two Divisor (TTTD) 030 * chunking method based on the paper of Kave Eshghi and Hsiu Khuern Tang, 2005. 031 * 032 * <p>The class implements a content-based {@link Chunker}, i.e. it determines breakpoints 033 * on the content rather than on the offset. It uses a fingerprinting algorithm to 034 * calculate window fingerprints and determine a breakpoint. The algorithm given in the 035 * constructor is instantiated to an implementation of a {@link Fingerprinter}. 036 * 037 * <p>The TTTD chunking method makes sure that it does not produce chunks smaller 038 * than a certain threshold. To do so, it ignores chunk boundaries until a minimum 039 * size is reached. Even though this negatively affects the duplicate detection 040 * (because bigger chunks are created), chunk boundaries are still "natural" -- 041 * because they are based on the underlying data. 042 * 043 * <p>To handle chunks that exceed a certain maximum size, TTTD applies two techniques. 044 * It defines the two divisors <i>D</i> (regular divisor) and <i>D'</i> (backup divisor) 045 * with <i>D</i> > <i>D'</i>. Because <i>D'</i> is smaller than <i>D</i>, it is more 046 * likely to find a breakpoint. 047 * 048 * <p>If <i>D</i> does not find a chunk boundary before the maximum chunk size is reached, 049 * the backup breakpoint found by <i>D'</i> is used. If <i>D</i> also does not find any 050 * breakpoints, TTTD simply cuts the chunk at the maximum chunk size. TTTD hence guarantees 051 * to emit chunks with a minimum and maximum size. 052 * 053 * @author Philipp C. Heckel (philipp.heckel@gmail.com) 054 * @see <a href="http://www.hpl.hp.com/techreports/2005/HPL-2005-30R1.html">Original TTTD paper: A framework for analyzing and improving content-based chunking algorithms (2005, Kave Eshghi and Hsiu Khuern Tang)</a> 055 */ 056public class TttdChunker extends Chunker { 057 private static final Logger logger = Logger.getLogger(TttdChunker.class.getSimpleName()); 058 059 public static final int DEFAULT_WINDOW_SIZE = 48; // like LBFS 060 public static final String DEFAULT_DIGEST_ALG = "SHA1"; 061 public static final String DEFAULT_FINGERPRINT_ALG = "Adler32"; 062 063 private int Tmin; 064 private int Tmax; 065 private int D; 066 private int Ddash; 067 private int windowSize; 068 private String checksumAlgorithm; 069 private String fingerprintAlgorithm; 070 private String name; 071 072 public TttdChunker(int Tmin, int Tmax, int D, int Ddash, int windowSize) { 073 this(Tmin, Tmax, D, Ddash, windowSize, DEFAULT_DIGEST_ALG, DEFAULT_FINGERPRINT_ALG); 074 } 075 076 public TttdChunker(int Tmin, int Tmax, int D, int Ddash, int windowSize, String digestAlg) { 077 this(Tmin, Tmax, D, Ddash, windowSize, digestAlg, DEFAULT_FINGERPRINT_ALG); 078 } 079 080 public TttdChunker(int avgChunkSize) { 081 this(avgChunkSize, DEFAULT_WINDOW_SIZE, DEFAULT_DIGEST_ALG, DEFAULT_FINGERPRINT_ALG); 082 } 083 084 /** 085 * Infer the optimal values for avgChunkSize from the orginal paper's optimal (measured) values. 086 * LBFS: avg. chunk size = 1015 bytes --> Tmin = 460, Tmax = 2800, D = 540, Ddash = 270 087 */ 088 public TttdChunker(int avgChunkSize, int windowSize, String digestAlg, String fingerprintAlg) { 089 this( 090 /* Tmin */(int) Math.round(460.0 * avgChunkSize / 1015.0), 091 /* Tmax */(int) Math.round(2800.0 * avgChunkSize / 1015.0), 092 /* D */(int) Math.round(540.0 * avgChunkSize / 1015.0), 093 /* D */(int) Math.round(270.0 * avgChunkSize / 1015.0), 094 /* rest */windowSize, digestAlg, fingerprintAlg, "TTTD-" + avgChunkSize + "-" + digestAlg + "-" + fingerprintAlg); 095 } 096 097 public TttdChunker(int Tmin, int Tmax, int D, int Ddash, int windowSize, String digestAlg, String fingerprintAlg) { 098 this(Tmin, Tmax, D, Ddash, windowSize, digestAlg, fingerprintAlg, "TTTD-" + Tmin + "-" + Tmax + "-" + D + "-" + Ddash + "-" + digestAlg + "-" 099 + fingerprintAlg); 100 } 101 102 private TttdChunker(int Tmin, int Tmax, int D, int Ddash, int windowSize, String digestAlg, String fingerprintAlg, String name) { 103 this.Tmin = Tmin; 104 this.Tmax = Tmax; 105 this.D = D; 106 this.Ddash = Ddash; 107 this.windowSize = windowSize; 108 this.checksumAlgorithm = digestAlg; 109 this.fingerprintAlgorithm = fingerprintAlg; 110 this.name = name; 111 112 if (windowSize > Tmin) { 113 throw new IllegalArgumentException("Window size must be smaller than Tmin."); 114 } 115 } 116 117 @Override 118 public ChunkEnumeration createChunks(File file) throws IOException { 119 return new TTTDEnumeration(new FileInputStream(file)); 120 } 121 122 @Override 123 public String getChecksumAlgorithm() { 124 return checksumAlgorithm; 125 } 126 127 @Override 128 public String toString() { 129 return name; 130 } 131 132 public class TTTDEnumeration implements ChunkEnumeration { 133 private InputStream in; 134 private boolean closed; 135 private byte[] c; 136 private int clen; 137 private int cpos; 138 139 private MessageDigest chunkDigest; 140 private MessageDigest fileDigest; 141 private Fingerprinter fingerprinter; 142 143 public TTTDEnumeration(InputStream in) throws IOException { 144 this.in = in; 145 this.closed = false; 146 this.c = new byte[8192]; 147 this.clen = -1; 148 this.cpos = -1; 149 150 try { 151 fingerprinter = Fingerprinter.getInstance(fingerprintAlgorithm); 152 chunkDigest = MessageDigest.getInstance(checksumAlgorithm); 153 fileDigest = MessageDigest.getInstance(checksumAlgorithm); 154 155 fileDigest.reset(); 156 } 157 catch (Exception e) { 158 throw new RuntimeException(e); 159 } 160 } 161 162 @Override 163 public boolean hasMoreElements() { 164 return !closed; 165 } 166 167 @Override 168 public Chunk nextElement() { 169 if (closed) { 170 return null; 171 } 172 173 chunkDigest.reset(); 174 fingerprinter.reset(); 175 176 try { 177 int backupBreak = 0; 178 int breakpoint = -1; 179 180 byte[] buf = new byte[Tmax]; 181 int bufpos = -1; 182 183 while (bufpos < buf.length - 1) { 184 if (cpos == -1 || cpos == clen - 1) { 185 cpos = -1; 186 clen = readFromInputStreamFixed(c, in);//in.read(c); 187 188 if (clen == -1) { 189 break; 190 } 191 192 fileDigest.update(c, 0, clen); 193 } 194 195 bufpos++; 196 cpos++; 197 buf[bufpos] = c[cpos]; 198 199 if (bufpos < Tmin) { 200 continue; 201 } 202 else if (bufpos == Tmin) { 203 fingerprinter.check(buf, bufpos - windowSize, windowSize); 204 } 205 else { 206 fingerprinter.roll(buf[bufpos]); 207 } 208 209 int hash = fingerprinter.getValue(); 210 211 // The value of r (right side) plays no role! #39 212 if ((hash % Ddash) == Ddash - 1) { 213 backupBreak = bufpos; 214 } 215 216 if ((hash % D) == D - 1) { 217 breakpoint = bufpos; 218 break; 219 } 220 221 if (bufpos < Tmax) { 222 continue; 223 } 224 225 if (backupBreak != 0) { 226 breakpoint = backupBreak; 227 break; 228 } 229 else { 230 breakpoint = bufpos; 231 break; 232 } 233 } 234 235 // Close if this was the last bytes 236 if (clen == -1) { 237 in.close(); 238 closed = true; 239 } 240 241 // EOF as breakpoint 242 if (breakpoint == -1) { 243 breakpoint = bufpos; 244 } 245 246 // Increase breakpoint 247 breakpoint++; 248 249 // Create chunk 250 chunkDigest.update(buf, 0, breakpoint); 251 252 byte[] chunkChecksum = chunkDigest.digest(); 253 byte[] chunkContents = buf; 254 int chunkSize = breakpoint; 255 byte[] fileChecksum = (clen == -1) ? fileDigest.digest() : null; 256 257 return new Chunk(chunkChecksum, chunkContents, chunkSize, fileChecksum); 258 } 259 catch (IOException ex) { 260 logger.log(Level.SEVERE, "Error while retrieving next chunk.", ex); 261 return null; 262 } 263 } 264 265 @Override 266 public void close() { 267 try { 268 in.close(); 269 } 270 catch (IOException e) { 271 logger.log(Level.INFO, "Error while closing", e); 272 } 273 } 274 275 /** 276 * Fixes the read errors occurring with Cipher streams in the standard 277 * Java read implementation. 278 */ 279 private int readFromInputStreamFixed(byte[] readToBuffer, InputStream inputStream) throws IOException { 280 int bytesRead = 0; 281 282 while (bytesRead < readToBuffer.length) { 283 int byteRead = inputStream.read(); 284 285 if (byteRead == -1) { 286 return (bytesRead != 0) ? bytesRead : -1; 287 } 288 289 readToBuffer[bytesRead] = (byte) byteRead; 290 bytesRead++; 291 } 292 293 return (bytesRead != 0) ? bytesRead : -1; 294 } 295 } 296}