Skip to content

Instantly share code, notes, and snippets.

@dozortsev
Created March 16, 2014 16:19
Show Gist options
  • Save dozortsev/9585702 to your computer and use it in GitHub Desktop.
Save dozortsev/9585702 to your computer and use it in GitHub Desktop.
File Scanner
package taskB;
import java.io.*;
import java.nio.file.Files;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.log4j.Logger;
/**
 * Scans a directory tree and reports files whose contents are byte-for-byte identical.
 *
 * <p>Files are first grouped by length; only files of equal length are then compared
 * byte by byte. Each set of identical files is written to the log: every duplicate on
 * its own line, followed by the file they were compared against and a blank line.
 *
 * <p>Instances keep mutable scan state and are not designed for concurrent use.
 */
public class FileScanner {

    private static final int BUFFER_SIZE_SMALL = 1024; // 1 KB
    private static final int BUFFER_SIZE_MEDIUM = 1048576; // 1 MB
    private static final int BUFFER_SIZE_BIG = 10485760; // 10 MB

    private static final Logger log = Logger.getLogger(FileScanner.class);

    /** Root directory of the scan. */
    private final String path;

    /**
     * Maps a file length to the canonical paths of all files with that length,
     * in the order they were discovered.
     */
    private final Map<Long, List<String>> mapFiles;

    /**
     * Creates a scanner for the given root directory.
     *
     * @param path root directory to scan
     */
    public FileScanner(String path) {
        this.path = path;
        this.mapFiles = new HashMap<>();
    }

    /**
     * Creates a scanner for the given root directory with a pre-sized length index.
     *
     * @param path     root directory to scan
     * @param capacity initial capacity of the internal length-to-paths map
     */
    public FileScanner(String path, int capacity) {
        this.path = path;
        this.mapFiles = new HashMap<>(capacity);
    }

    /**
     * Returns the root directory this scanner was created for.
     */
    String getPath() {
        return path;
    }

    /**
     * Resolves the canonical path of a file.
     *
     * @return the canonical path, or {@code null} if it cannot be resolved;
     *         the failure is logged and callers skip such entries
     */
    private String toCanonicalPath(File file) {
        try {
            return file.getCanonicalPath();
        } catch (IOException e) {
            log.error("Cannot resolve canonical path: " + file, e);
            return null;
        }
    }

    /**
     * Opens a buffered input stream over the given file.
     *
     * @param file file to read
     * @return a stream positioned at the start of the file
     * @throws FileNotFoundException if the file cannot be opened for reading
     */
    protected InputStream getInputStream(File file) throws FileNotFoundException {
        return new BufferedInputStream(new FileInputStream(file));
    }

    /**
     * Chooses a read-buffer size appropriate for a file of the given length.
     *
     * @param length file length in bytes
     * @return buffer size in bytes: 1 KB for files under 1 MB, 10 KB under 10 MB,
     *         1 MB under 100 MB, 10 MB under 1000 MB, otherwise 100 MB
     */
    protected int defineBufferSize(long length) {
        if (length < BUFFER_SIZE_MEDIUM) { // file smaller than 1 MB
            return BUFFER_SIZE_SMALL; // 1 KB
        }
        if (length < BUFFER_SIZE_BIG) { // file smaller than 10 MB
            return BUFFER_SIZE_SMALL * 10; // 10 KB
        }
        if (length < BUFFER_SIZE_BIG * 10) { // file smaller than 100 MB
            return BUFFER_SIZE_MEDIUM; // 1 MB
        }
        if (length < BUFFER_SIZE_BIG * 100) { // file smaller than 1000 MB
            return BUFFER_SIZE_BIG; // 10 MB
        }
        return BUFFER_SIZE_BIG * 10; // 100 MB
    }

    /**
     * Recursively indexes every readable regular file under {@code path} by its length.
     *
     * <p>All files of a directory are indexed before any of its subdirectories is
     * descended into. Symbolic links to directories are not followed.
     *
     * @param path directory to scan; {@code null} is ignored
     */
    private void scanner(String path) {
        if (path == null) {
            return;
        }
        File[] entries = new File(path).listFiles();
        if (entries == null) {
            // listFiles() returns null when the path is not a directory or an I/O error occurs.
            return;
        }
        List<File> subDirs = new ArrayList<>();
        for (File entry : entries) {
            if (!entry.canRead()) {
                continue;
            }
            if (entry.isFile()) {
                register(entry);
            } else if (entry.isDirectory() && !Files.isSymbolicLink(entry.toPath())) {
                subDirs.add(entry);
            }
        }
        for (File dir : subDirs) {
            scanner(toCanonicalPath(dir));
        }
    }

    /**
     * Adds a file to the length index under its canonical path.
     * Files whose canonical path cannot be resolved are skipped.
     */
    private void register(File file) {
        String canonical = toCanonicalPath(file);
        if (canonical == null) {
            return;
        }
        Long size = file.length(); // the file length is the key in the map
        List<String> sameSize = mapFiles.get(size);
        if (sameSize == null) {
            sameSize = new ArrayList<>();
            mapFiles.put(size, sameSize);
        }
        sameSize.add(canonical);
    }

    /**
     * Compares the contents of two files byte by byte.
     *
     * @param path1 path of the first file
     * @param path2 path of the second file
     * @return {@code true} only if the paths differ and both files were read completely
     *         and hold identical bytes; {@code false} if either path is {@code null},
     *         the paths are equal, the contents or lengths differ, or an I/O error occurs
     */
    protected boolean compareFiles(String path1, String path2) {
        if (path1 == null || path2 == null || path1.equals(path2)) {
            return false;
        }
        final File f1 = new File(path1);
        final File f2 = new File(path2);
        final int size = defineBufferSize(f1.length());
        final byte[] bytesF1 = new byte[size];
        final byte[] bytesF2 = new byte[size];
        try (InputStream in1 = getInputStream(f1); InputStream in2 = getInputStream(f2)) {
            while (true) {
                int read1 = readFully(in1, bytesF1);
                int read2 = readFully(in2, bytesF2);
                // Different chunk lengths mean one file ended before the other.
                if (read1 != read2 || !prefixEquals(bytesF1, bytesF2, read1)) {
                    return false;
                }
                if (read1 < size) {
                    return true; // both streams reached end of file together
                }
            }
        } catch (IOException e) {
            log.error("Error:", e);
            // An unreadable file must never be reported as a duplicate.
            return false;
        }
    }

    /**
     * Reads from the stream until the buffer is full or the end of the stream is reached.
     * A single {@code read} call may return fewer bytes than requested, so it is looped.
     *
     * @return number of bytes stored in {@code buffer}; less than its length only at end of stream
     */
    private static int readFully(InputStream in, byte[] buffer) throws IOException {
        int total = 0;
        while (total < buffer.length) {
            int n = in.read(buffer, total, buffer.length - total);
            if (n == -1) {
                break;
            }
            total += n;
        }
        return total;
    }

    /**
     * Tells whether the first {@code length} bytes of both arrays are equal.
     */
    private static boolean prefixEquals(byte[] a, byte[] b, int length) {
        for (int i = 0; i < length; i++) {
            if (a[i] != b[i]) {
                return false;
            }
        }
        return true;
    }

    /**
     * Scans the root directory and logs every set of files with identical contents.
     *
     * <p>For each set, every duplicate is logged on its own line, followed by the file
     * they were compared against and a trailing newline. The index is rebuilt on each
     * call, so the method can be invoked repeatedly.
     */
    public void searchFiles() {
        mapFiles.clear(); // drop results of a previous scan so paths are not indexed twice
        scanner(path);
        for (List<String> paths : mapFiles.values()) {
            final int count = paths.size();
            if (count < 2) {
                continue;
            }
            // grouped[k] is set once paths.get(k) has been reported as a duplicate,
            // so it is neither compared again nor used as a reference file.
            final boolean[] grouped = new boolean[count];
            for (int i = 0; i < count; i++) {
                if (grouped[i]) {
                    continue;
                }
                String path1 = paths.get(i);
                boolean isFound = false;
                for (int j = i + 1; j < count; j++) {
                    if (grouped[j]) {
                        continue;
                    }
                    String path2 = paths.get(j);
                    if (compareFiles(path1, path2)) {
                        log.info(path2);
                        grouped[j] = true;
                        isFound = true;
                    }
                }
                if (isFound) {
                    log.info(path1 + "\n");
                }
            }
        }
    }
}
# Root logger: log at INFO level and above, sending output to the appender named "file"
log4j.rootLogger=INFO, file
# "file" appender: a rolling file appender writing to log/files.log.
# The log rolls over once it reaches 4MB and one backup file is kept.
log4j.appender.file=org.apache.log4j.RollingFileAppender
log4j.appender.file.File=log/files.log
log4j.appender.file.MaxFileSize=4MB
log4j.appender.file.MaxBackupIndex=1
# Layout: emit only the log message followed by a line separator (no timestamp or level)
log4j.appender.file.layout=org.apache.log4j.PatternLayout
log4j.appender.file.layout.ConversionPattern=%m%n
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment