/*
 * Source: GitHub Gist by @nobodxbodon (gist nobodxbodon/5d805f067eadf2cb31adc3a790742e4b).
 * Last active September 19, 2019 22:51.
 */
package com.github.program_in_chinese.工具;
import java.io.File;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.function.Consumer;
/**
 * Command-line tool that walks a directory tree of identifier-extraction output files,
 * collects the identifiers found in them and prints a frequency table, most frequent first.
 *
 * <p>Input format, as far as this class relies on it: the first {@code 7} lines of every file
 * are skipped; every remaining line contributes identifiers only when it consists of exactly
 * two tab-separated fields, the second of which is the raw identifier text.
 *
 * <p>Output: one line per distinct identifier in the form
 * {@code identifier|count|cumulativeCount|cumulativePercent}.
 */
public class 方舟标识符统计 {

  /** Default root directory of the extraction output; used when no argument is passed. */
  private final static String 提取路径 = "/Users/xuanwu/work/方舟/标识符提取_0911/";
  // maple_driver/src/file_utils.cpp

  /** Number of leading lines skipped in every input file. */
  private final static int 跳过行数 = 7;

  /** Maximum number of table rows printed by {@link #main(String[])}. */
  private final static int 输出上限 = 50000;

  /** Plain words that are excluded from the count (presumably from license text and comments). */
  private final static Set<String> 非标识符 = new HashSet<>();

  /** Language keywords, preprocessor words and similar tokens that are excluded from the count. */
  private final static Set<String> 关键字 = new HashSet<>();

  /** Standard-library names that are excluded from the count. */
  private final static Set<String> 库 = new HashSet<>();

  /** Never populated or read in this class; kept only because the original declared it. */
  private final static Set<String> 特殊符号 = new HashSet<>();

  static {
    非标识符.addAll(Arrays.asList(
        "OR", "and", "the", "to", "is", "of", "a", "can", "for", "more", "copy", "may", "according",
        "v1", "use", "c", "ANY", "INCLUDING", "OpenArkCompiler", "WARRANTIES", "WITHOUT", "KIND",
        "licensed", "software", "NOT", "SOFTWARE", "MERCHANTABILITY", "Mulan", "EITHER", "obtain",
        "BASIS", "Technologies", "You", "LIMITED", "IMPLIED", "PARTICULAR", "FOR", "FIT", "Huawei",
        "terms", "at", "TO", "ON", "OF", "rights", "IS", "http", "AS", "AN", "EXPRESS", "THIS",
        "PROVIDED", "under", "See", "BUT", "Copyright", "A", "in", "not", "be", "by", "or", "The",
        "with", "If", "will", "it", "i", "from", "that", "no", "as", "has", "an", "RETURN", "are",
        "FUNCTION", "VALUE", "INPUT", "PARAMETERS", "DESCRIPTION", "OUTPUT", "all", "when", "have",
        "buffer", "into", "conditions", "only", "out", "call", "we", "need", "should", "array",
        "must", "there", "used", "characters", "defined", "character", "does", "address", "been",
        "then", "but", "same", "code", "stored", "given", "This", "any", "so", "than", "other",
        "strings", "returns", "here", "on", "which", "some", "because", "were", "its"
    ));
    关键字.addAll(Arrays.asList(
        "this", "include", "if", "return", "const", "else", "endif", "function", "size_t", "namespace",
        "false", "type", "int", "null", "true", "while", "default", "char", "bool", "case", "switch",
        "break", "void", "New", "define", "continue", "do", "class", "new", "ifdef", "wchar_t", "sizeof",
        "unsigned", "undef", "static"
    ));
    库.addAll(Arrays.asList(
        "string", "std", "nullptr", "static_cast", "set", "endl", "vector", "iostream"));
  }

  /**
   * Collects all identifiers below the root directory and prints the frequency table.
   *
   * @param args optional; {@code args[0]} overrides the default root directory
   * @throws Exception if an input file cannot be read
   */
  public static void main(String[] args) throws Exception {
    File 提取标识符目录 = new File(args.length > 0 ? args[0] : 提取路径);
    List<String> 所有标识符 = 遍历(提取标识符目录);

    // Occurrence count per distinct identifier.
    Map<String, Integer> 去重标识符 = new HashMap<>();
    for (String 标识符 : 所有标识符) {
      去重标识符.merge(标识符, 1, Integer::sum);
    }
    System.out.println(去重标识符.size() + " 共: " + 所有标识符.size());

    // Sorted ascending by count and then walked backwards, so the most frequent comes first.
    List<Entry<String, Integer>> list = new ArrayList<>(去重标识符.entrySet());
    list.sort(Entry.comparingByValue());

    // The percentage is relative to the total of this run (was a hard-coded 22965.0).
    double 总词数 = 所有标识符.size();
    int 累计词数 = 0;
    int 已输出 = 0;
    for (int i = list.size() - 1; i >= 0 && 已输出 < 输出上限; i--, 已输出++) {
      Entry<String, Integer> 词频 = list.get(i);
      int 词数 = 词频.getValue();
      累计词数 += 词数;
      System.out.println(词频.getKey() + "|" + 词数 + "|" + 累计词数 + "|" + 累计词数 / 总词数 * 100);
    }
  }

  /**
   * Recursively collects the filtered identifiers of every file below {@code 目录}.
   *
   * <p>Directories named {@code bin} or {@code deplibs} and files named {@code BUILD.gn},
   * {@code Makefile} or ending in {@code .h} are skipped. A progress line (running identifier
   * count plus path) is printed for every visited entry.
   *
   * @param 目录 directory or file to process; anything else yields an empty list
   * @return all identifiers found, duplicates included
   * @throws Exception if a file cannot be read
   */
  private static List<String> 遍历(File 目录) throws Exception {
    List<String> 标识符 = new ArrayList<>();
    if (目录.isDirectory()) {
      if (目录.getName().equals("bin") || 目录.getName().equals("deplibs")) {
        return 标识符;
      }
      File[] 子项 = 目录.listFiles();
      if (子项 == null) {
        // listFiles() returns null when the directory cannot be listed (I/O error, permissions).
        System.err.println("无法列出目录: " + 目录.getAbsolutePath());
        return 标识符;
      }
      for (File 子目录 : 子项) {
        标识符.addAll(遍历(子目录));
        System.out.println(标识符.size() + " " + 子目录.getAbsolutePath());
      }
    } else if (目录.isFile()) {
      String 名称 = 目录.getName();
      if (名称.equals("BUILD.gn") || 名称.equals("Makefile") || 名称.endsWith(".h")) {
        return 标识符;
      }
      List<String> 行 = 取行(目录);
      // Clamp the start index so files shorter than the skipped header do not throw.
      int 起始 = Math.min(跳过行数, 行.size());
      for (String 某行 : 行.subList(起始, 行.size())) {
        标识符.addAll(清理(某行));
      }
      System.out.println(标识符.size() + " " + 目录.getAbsolutePath());
    }
    return 标识符;
  }

  /**
   * Extracts the identifiers of one input line.
   *
   * <p>The line must consist of exactly two tab-separated fields with a non-empty second field.
   * One leading {@code &} or {@code !} is stripped from that field, it is split on literal dots,
   * and every part that does not look like an identifier or is on an exclusion list is dropped.
   *
   * @param 某行 one line of an extraction file
   * @return the accepted identifiers in order of appearance; possibly empty, never {@code null}
   */
  private static List<String> 清理(String 某行) {
    List<String> 所有标识符 = new ArrayList<>();
    String[] 分割 = 某行.split("\t");
    if (分割.length == 2 && !分割[1].isEmpty()) {
      String 未处理标识符 = 分割[1];
      if (未处理标识符.startsWith("&") || 未处理标识符.startsWith("!")) {
        未处理标识符 = 未处理标识符.substring(1);
      }
      // The dot must be escaped: split(".") treats it as "any character" and returns nothing.
      // A value without a dot comes back as a single element, so no separate branch is needed.
      所有标识符.addAll(Arrays.asList(未处理标识符.split("\\.")));
    }
    List<String> 过滤后标识符 = new ArrayList<>();
    for (String 某标识符 : 所有标识符) {
      // Filter by identifier naming rules: not purely numeric, no '-', single quote,
      // slash, backslash, '&', '!', '~' or '?', at least two characters, and not excluded.
      if (某标识符.isEmpty() || 某标识符.matches("\\d+") || 某标识符.contains("-") || 某标识符.contains("'")
          || 某标识符.contains("/") || 某标识符.contains("\\") || 某标识符.contains("&") || 某标识符.contains("!")
          || 某标识符.contains("~") || 某标识符.contains("?")
          || 非标识符.contains(某标识符) || 关键字.contains(某标识符) || 库.contains(某标识符)
          || 某标识符.length() < 2) {
        continue;
      }
      过滤后标识符.add(某标识符);
    }
    return 过滤后标识符;
  }

  /**
   * Reads all lines of a file, decoding it as ISO-8859-1.
   *
   * @param 文件 file to read
   * @return the lines of the file in order, as a new mutable list
   * @throws Exception if the file cannot be opened or read
   */
  public static List<String> 取行(File 文件) throws Exception {
    // readAllLines closes the file itself; the previous Files.lines stream was never closed.
    return new ArrayList<>(Files.readAllLines(文件.toPath(), StandardCharsets.ISO_8859_1));
  }
}
// (GitHub Gist page footer: "Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment")