-
-
Save nobodxbodon/5d805f067eadf2cb31adc3a790742e4b to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package com.github.program_in_chinese.工具; | |
import java.io.File; | |
import java.nio.charset.StandardCharsets; | |
import java.nio.file.Files; | |
import java.util.ArrayList; | |
import java.util.Arrays; | |
import java.util.HashMap; | |
import java.util.HashSet; | |
import java.util.List; | |
import java.util.Map; | |
import java.util.Map.Entry; | |
import java.util.Set; | |
import java.util.function.Consumer; | |
public class 方舟标识符统计 { | |
private final static String 提取路径 = "/Users/xuanwu/work/方舟/标识符提取_0911/"; | |
// maple_driver/src/file_utils.cpp | |
private final static Set<String> 非标识符 = new HashSet<>(); | |
private final static Set<String> 关键字 = new HashSet<>(); | |
private final static Set<String> 库 = new HashSet<>(); | |
private final static Set<String> 特殊符号 = new HashSet<>(); | |
static { | |
非标识符.addAll(Arrays.asList( | |
"OR", "and", "the", "to", "is", "of", "a", "can", "for", "more", "copy", "may", "according", | |
"v1", "use", "c", "ANY", "INCLUDING", "OpenArkCompiler", "WARRANTIES", "WITHOUT", "KIND", | |
"licensed", "software", "NOT", "SOFTWARE", "MERCHANTABILITY", "Mulan", "EITHER", "obtain", | |
"BASIS", "Technologies", "You", "LIMITED", "IMPLIED", "PARTICULAR", "FOR", "FIT", "Huawei", | |
"terms", "at", "TO", "ON", "OF", "rights", "IS", "http", "AS", "AN", "EXPRESS", "THIS", | |
"PROVIDED", "under", "See", "BUT", "Copyright", "A", "in", "not", "be", "by", "or", "The", | |
"with", "If", "will", "it", "i", "from", "that", "no", "as", "has", "an", "RETURN", "are", | |
"FUNCTION", "VALUE", "INPUT", "PARAMETERS", "DESCRIPTION", "OUTPUT", "all", "when", "have", | |
"buffer", "into", "conditions", "only", "out", "call", "we", "need", "should", "array", | |
"must", "there", "used", "characters", "defined", "character", "does", "address", "been", | |
"then", "but", "same", "code", "stored", "given", "This", "any", "so", "than", "other", | |
"strings", "returns", "here", "on", "which", "some", "because", "were", "its" | |
)); | |
关键字.addAll(Arrays.asList( | |
"this", "include", "if", "return", "const", "else", "endif", "function", "size_t", "namespace", | |
"false", "type", "int", "null", "true", "while", "default", "char", "bool", "case", "switch", | |
"break", "void", "New", "define", "continue", "do", "class", "new", "ifdef", "wchar_t", "sizeof", | |
"unsigned", "undef", "static" | |
)); | |
库.addAll(Arrays.asList( | |
"string", "std", "nullptr", "static_cast", "set", "endl", "vector", "iostream")); | |
} | |
private final static Map<String, Integer> 去重标识符 = new HashMap<>(); | |
public static void main(String[] args) throws Exception { | |
File 提取标识符目录 = new File(提取路径); | |
List<String> 所有标识符 = 遍历(提取标识符目录); | |
for (String 标识符 : 所有标识符) { | |
if (去重标识符.containsKey(标识符)) { | |
去重标识符.put(标识符, 去重标识符.get(标识符) + 1); | |
} else { | |
去重标识符.put(标识符, 1); | |
} | |
} | |
System.out.println(去重标识符.size() + " 共: " + 所有标识符.size()); | |
List<Entry<String, Integer>> list = new ArrayList<>(去重标识符.entrySet()); | |
list.sort(Entry.comparingByValue()); | |
int 前几位 = 50000; | |
int 累计词数 = 0; | |
for (int i = list.size() - 1; i >= 0 && 前几位 >= 0; i--, 前几位--) { | |
Entry<String, Integer> 词频 = list.get(i); | |
int 词数 = 词频.getValue(); | |
累计词数 += 词数; | |
System.out.println(词频.getKey() + "|" + 词数 + "|" + 累计词数 + "|" + 累计词数/22965.0 * 100); | |
} | |
} | |
private static List<String> 遍历(File 目录) throws Exception { | |
List<String> 标识符 = new ArrayList<>(); | |
if (目录.isDirectory()) { | |
if (目录.getName().equals("bin") || 目录.getName().equals("deplibs")) { | |
return 标识符; | |
} | |
for (File 子目录 : 目录.listFiles()) { | |
标识符.addAll(遍历(子目录)); | |
System.out.println(标识符.size() + " " + 子目录.getAbsolutePath()); | |
} | |
} else if (目录.isFile()) { | |
String 名称 = 目录.getName(); | |
if (名称.equals("BUILD.gn") || 名称.equals("Makefile") || 名称.endsWith(".h")) { | |
return 标识符; | |
} | |
List<String> 行 = 取行(目录); | |
List<String> 清理后标识符 = new ArrayList<>(); | |
for (String 某行 : 行.subList(7, 行.size())) { | |
清理后标识符.addAll(清理(某行)); | |
} | |
标识符.addAll(清理后标识符); | |
System.out.println(标识符.size() + " " + 目录.getAbsolutePath()); | |
} | |
return 标识符; | |
} | |
private static List<String> 清理(String 某行) { | |
List<String> 所有标识符 = new ArrayList<>(); | |
String[] 分割 = 某行.split("\t"); | |
if (分割.length == 2 && !分割[1].isEmpty()) { | |
String 未处理标识符 = 分割[1]; | |
if (未处理标识符.startsWith("&") || 未处理标识符.startsWith("!")) { | |
未处理标识符 = 未处理标识符.substring(1); | |
} | |
if (未处理标识符.indexOf(".") >=0) { | |
String[] 再分割 = 未处理标识符.split("."); | |
for (String 标识符 : 再分割) { | |
所有标识符.add(标识符); | |
} | |
} else { | |
所有标识符.add(未处理标识符); | |
} | |
} | |
List<String> 过滤后标识符 = new ArrayList<>(); | |
for (String 某标识符 : 所有标识符) { | |
// 按照标识符命名规则过滤, 如不能是数字, 不包含'-', 单引号, /\等等 | |
if (某标识符.isEmpty() || 某标识符.matches("\\d+") || 某标识符.contains("-") || 某标识符.contains("'") | |
|| 某标识符.contains("/") || 某标识符.contains("\\") || 某标识符.contains("&") || 某标识符.contains("!") | |
|| 某标识符.contains("~") || 某标识符.contains("?") | |
|| 非标识符.contains(某标识符) || 关键字.contains(某标识符) || 库.contains(某标识符) | |
|| 某标识符.length() < 2) { | |
continue; | |
} | |
过滤后标识符.add(某标识符); | |
} | |
return 过滤后标识符; | |
} | |
public static List<String> 取行(File 文件) throws Exception { | |
List<String> 行 = new ArrayList<>(); | |
Files.lines(文件.toPath(), StandardCharsets.ISO_8859_1).forEach(new Consumer<String>() { | |
@Override | |
public void accept(String 文本) { | |
行.add(文本); | |
} | |
}); | |
return 行; | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment