Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
d8814c3
adds more logging to capture interesting numbers
hsanchez Dec 12, 2016
48f0995
automatically generating stopwords.. and some interesting refactorings
hsanchez Dec 15, 2016
57f4620
bunch of changes.. new workflow: first generate the stop list from al…
hsanchez Dec 16, 2016
1b75c6a
updates jar file
hsanchez Dec 16, 2016
ca22789
code cleanup..
hsanchez Dec 16, 2016
636806c
ensures that we only read the stop words file ONCE
hsanchez Dec 16, 2016
be01482
support to kestrel's data (at least for 347 chunks extracted from a 1…
hsanchez Dec 16, 2016
cd06838
some Java projects for netflix, facebook, square, pinterest, etc.
hsanchez Dec 16, 2016
97427d1
updates jar file
hsanchez Dec 16, 2016
c54345f
updates how we write files
hsanchez Dec 17, 2016
c934ba3
updates jar
hsanchez Dec 17, 2016
64154f7
deals with gson.fromJson return null
hsanchez Dec 17, 2016
6050e36
deals with gson.fromJson return null
hsanchez Dec 17, 2016
02dae10
updates jar
hsanchez Dec 17, 2016
47621b5
updates jar and adds filter to avoid creating clusters with no shared…
hsanchez Dec 17, 2016
698fbdb
updates partitions jar
hsanchez Dec 17, 2016
c01dd1c
code cleanup and more logging.
hsanchez Dec 18, 2016
51d1eca
code cleanup and more logging.
hsanchez Dec 18, 2016
52990f9
steps
hsanchez Dec 18, 2016
c2dd11d
threads
hsanchez Dec 18, 2016
ad5da11
threads
hsanchez Dec 18, 2016
36c2480
no threads
hsanchez Dec 18, 2016
c70f8d6
no threads
hsanchez Dec 18, 2016
16663af
addings steps
Dec 18, 2016
696a846
adds updated output
hsanchez Dec 18, 2016
5a6d2bf
not optimized version of ProcessProjects
hsanchez Dec 18, 2016
c083b07
Merge branch 'output' of https://github.com/aas-integration/partition…
Dec 18, 2016
4696c03
adds excel spreadsheet and cleaned up version of output.txt
hsanchez Dec 19, 2016
d7963ca
glossary of terms:
hsanchez Jan 18, 2017
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file added data.xlsx
Binary file not shown.
Binary file modified lib/introspector-0.1.jar
Binary file not shown.
18,662 changes: 13,360 additions & 5,302 deletions output.txt

Large diffs are not rendered by default.

Binary file modified partitions.jar
Binary file not shown.
4 changes: 4 additions & 0 deletions src/main/java/com/vesperin/partition/BasicCli.java
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
import com.github.rvesse.airline.Cli;
import com.github.rvesse.airline.builder.CliBuilder;
import com.github.rvesse.airline.parser.errors.ParseException;
import com.vesperin.partition.cmds.CommonWords;
import com.vesperin.partition.cmds.ProcessKestrelData;
import com.vesperin.partition.cmds.ProcessProjects;

import java.util.Objects;
Expand Down Expand Up @@ -123,6 +125,8 @@ default Cli<CliCommand> buildCli(){
return buildCli(Cli.<CliCommand>builder("vip")
.withDescription("Project Partitioning CLI")
.withCommand(ProcessProjects.class)
.withCommand(CommonWords.class)
.withCommand(ProcessKestrelData.class)
);
}

Expand Down
161 changes: 161 additions & 0 deletions src/main/java/com/vesperin/partition/cmds/CommonWords.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
package com.vesperin.partition.cmds;

import com.github.rvesse.airline.HelpOption;
import com.github.rvesse.airline.annotations.Command;
import com.github.rvesse.airline.annotations.Option;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.primitives.Ints;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import com.vesperin.partition.BasicCli;
import com.vesperin.partition.spi.Git;
import com.vesperin.partition.utils.IO;
import com.vesperin.partition.utils.Jsons;
import com.vesperin.partition.utils.Projects;
import com.vesperin.text.Project;
import com.vesperin.text.Selection.Word;
import com.vesperin.text.spi.BasicExecutionMonitor;
import com.vesperin.text.spi.ExecutionMonitor;

import javax.inject.Inject;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.List;
import java.util.Map;

import static java.util.stream.Collectors.toList;

/**
 * CLI command ({@code w}) that gathers the words shared across the projects
 * listed in a corpus.json file and emits them as a "stops" JSON document.
 *
 * <p>The command resolves the projects named in the corpus file, asks
 * {@link Projects#buildProjectsByFreQ} for each project's top-k words, merges equal
 * words across projects, keeps those whose value is at least
 * {@link #MIN_FREQUENCY}, and either logs the resulting JSON or writes it to
 * {@code Jsons.STOPS} inside the output folder (when {@code --dump} is given).
 *
 * @author Huascar Sanchez
 */
@SuppressWarnings("FieldCanBeLocal") @Command(name = "w", description = "Gathers common words across projects")
public class CommonWords implements BasicCli.CliCommand {
  private static final ExecutionMonitor MONITOR = BasicExecutionMonitor.get();

  /** Key under which the selected words are stored in the emitted JSON document. */
  private static final String STOPS_KEY = "stops";

  /** Minimum merged value a word must reach to be emitted. */
  private static final int MIN_FREQUENCY = 10;

  // NOTE(review): parameterized with ProcessProjects rather than CommonWords; this looks
  // like a copy-paste leftover but is kept as-is so the injected field type is unchanged.
  @Inject
  HelpOption<ProcessProjects> help;

  @Option(name = {"-f", "--from"}, arity = 1, description = "locates the corpus.json input")
  private String from = null;

  @Option(name = {"-d", "--dump"}, description = "dump words onto " + Jsons.STOPS + " file")
  private boolean dump = false;

  @Option(name = {"-t", "--to"}, arity = 1, description = "locates path of output folder")
  private String to = null;

  @Option(name = {"-v", "--verbose"}, description = "Prints logging messages")
  private boolean verbose = false;

  @Option(name = {"-k", "--topk"}, arity = 1, description = "Select top k words from each project. Default is 75.")
  private int topk = 75;

  @Option(name = {"-s", "--scope"}, arity = 1, description = "Search scope: (c)lassname (default), (m)ethodname, method (b)ody")
  private String scope = "c";

  /**
   * Runs the command.
   *
   * @return {@code 0} when help was shown or the words were produced; {@code -1} when
   *     a required option is missing, the corpus file does not exist, no project
   *     could be obtained, or word collection failed.
   * @throws Exception if resolving the projects listed in the corpus file fails.
   */
  @Override public Integer call() throws Exception {
    if (help.showHelpIfRequested()) {
      return 0;
    }

    if (BasicCli.allNull(1, from)) {
      System.err.println("Unable to locate corpus.json file.");
      return -1;
    }

    if (BasicCli.allNull(1, to)) {
      System.err.println("Unable to locate output folder.");
      return -1;
    }

    if (verbose) {
      MONITOR.enable();
    } else {
      MONITOR.disable();
    }

    final Path corpusJson = Paths.get(from).toAbsolutePath();
    if (!Files.exists(corpusJson)) {
      System.err.println(
        String.format("ERROR: Unable to find %s ", corpusJson)
      );
      return -1;
    }

    final Path outDir = Paths.get(to).toAbsolutePath();

    final List<String> projectNames = Git.processJson(corpusJson, outDir);
    if (projectNames.isEmpty()) {
      System.err.println(
        "ERROR: Unable to download github projects in " + corpusJson.toFile().getName()
      );
      return -1;
    }

    try {
      final List<Project> projects = Projects.buildProjectsByFreQ(topk, scope, outDir, projectNames);

      final Map<Word, Word> merged = mergeWords(projects);
      final List<Word> selected = selectFrequentWords(merged);

      MONITOR.info(String.format("Total words collected is %d ", merged.keySet().size()));
      MONITOR.info(String.format("Top %d words selected from %d words", selected.size(), merged.keySet().size()));

      emit(toJson(selected), outDir);
    } catch (Exception e) {
      // Boundary of the command: report the failure and signal it through the exit code.
      e.printStackTrace(System.err);
      return -1;
    }

    return 0;
  }

  /** Merges equal words found in different projects into a single entry per word. */
  private static Map<Word, Word> mergeWords(List<Project> projects) {
    final Map<Word, Word> merged = Maps.newHashMap();

    for (Project project : projects) {
      for (Word word : project.wordSet()) {
        if (!merged.containsKey(word)) {
          merged.put(word, word);
          continue;
        }

        // Fold the newcomer's containers and value into the entry seen first.
        final Word existing = merged.get(word);
        word.container().forEach(existing::add);
        existing.count(word.value());

        // Re-put kept from the original implementation; presumably a no-op when
        // Word's hash does not depend on the state mutated above -- TODO confirm.
        merged.put(existing, existing);
      }
    }

    return merged;
  }

  /** Keeps the words whose value reaches {@link #MIN_FREQUENCY}, highest value first. */
  private static List<Word> selectFrequentWords(Map<Word, Word> merged) {
    return merged.keySet().stream()
      .filter(w -> w.value() >= MIN_FREQUENCY)
      .sorted((a, b) -> Ints.compare(b.value(), a.value()))
      .collect(toList());
  }

  /** Renders the selected words as a pretty-printed {@code {"stops": [...]}} document. */
  private static String toJson(List<Word> selected) {
    final List<String> stops = Lists.newArrayList();
    selected.forEach(w -> stops.add(w.element()));

    final Map<String, List<String>> wordsMap = Maps.newHashMap();
    wordsMap.put(STOPS_KEY, stops);

    final Gson gson = new GsonBuilder()
      .setPrettyPrinting()
      .create();

    return gson.toJson(wordsMap);
  }

  /**
   * Logs the JSON document, or writes it to {@code Jsons.STOPS} under {@code outDir}
   * when {@code --dump} was requested.
   */
  private void emit(String json, Path outDir) throws Exception {
    if (!dump) {
      // NOTE(review): the JSON is only reported through the monitor here, which is
      // disabled unless --verbose is given -- confirm that is the intended behavior.
      MONITOR.info(json);
      return;
    }

    final Path stopsFile = outDir.resolve(Jsons.STOPS);
    Files.deleteIfExists(stopsFile);

    // Encode explicitly as UTF-8 instead of relying on the platform default charset.
    IO.writeFile(stopsFile, json.getBytes(java.nio.charset.StandardCharsets.UTF_8));

    MONITOR.info(String.format("%s was created.", stopsFile));
  }
}
157 changes: 157 additions & 0 deletions src/main/java/com/vesperin/partition/cmds/ProcessKestrelData.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,157 @@
package com.vesperin.partition.cmds;

import com.github.rvesse.airline.HelpOption;
import com.github.rvesse.airline.annotations.Command;
import com.github.rvesse.airline.annotations.Option;
import com.google.common.collect.Lists;
import com.vesperin.partition.BasicCli;
import com.vesperin.partition.utils.IO;
import com.vesperin.partition.utils.Threads;
import com.vesperin.text.Corpus;
import com.vesperin.text.Grouping;
import com.vesperin.text.Introspector;
import com.vesperin.text.Selection.Word;
import com.vesperin.text.spelling.StopWords;
import com.vesperin.text.spi.BasicExecutionMonitor;
import com.vesperin.text.spi.ExecutionMonitor;
import com.vesperin.text.tokenizers.Tokenizers;
import com.vesperin.text.tokenizers.WordsTokenizer;

import javax.inject.Inject;
import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Collection;
import java.util.List;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Future;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * CLI command ({@code k}) that processes a directory of JSON files: every file is
 * turned into a corpus of extracted names, the corpora are merged, and the merged
 * corpus is queried for typical words that are then used to group documents.
 *
 * @author Huascar Sanchez
 */
@SuppressWarnings("FieldCanBeLocal") @Command(name = "k", description = "Process Kestrel's data of projects")
public class ProcessKestrelData implements BasicCli.CliCommand {

  private static final ExecutionMonitor MONITOR = BasicExecutionMonitor.get();

  // NOTE(review): parameterized with ProcessProjects rather than ProcessKestrelData; this
  // looks like a copy-paste leftover but is kept so the injected field type is unchanged.
  @Inject HelpOption<ProcessProjects> help;

  @Option(name = {"-d", "--directory"}, arity = 1, description = "directory containing JSON files to process.")
  private String directory = null;

  @Option(name = {"-v", "--verbose"}, description = "shows dumping-of-files status.")
  private boolean verbose = false;


  /**
   * Runs the command.
   *
   * @return {@code 0} when help was shown or processing ran; {@code -1} when the
   *     directory option is missing or the directory does not exist.
   * @throws Exception declared by the command contract.
   */
  @Override public Integer call() throws Exception {
    if (help.showHelpIfRequested()) {
      return 0;
    }

    if (BasicCli.allNull(1, directory)) {
      System.err.println("-d | --directory <path/to/json-files> is missing.");
      return -1;
    }

    final Path filesDirectory = Paths.get(directory).toAbsolutePath();

    if (!Files.exists(filesDirectory)) {
      System.err.println(String.format("%s does not exist!", filesDirectory));
      return -1;
    }

    if (verbose) {
      MONITOR.enable();
    } else {
      MONITOR.disable();
    }

    new ProcessJSONFiles(filesDirectory).process();

    return 0;
  }

  /** Builds one corpus per JSON file under a directory and groups the merged result. */
  private static class ProcessJSONFiles {
    /** Matches every {@code {...}} entry; group 1 is the text between the braces. */
    private static final Pattern LOG_ENTRY = Pattern.compile("\\{(.*?)\\}");

    /** Number of words requested from the typicality query. */
    private static final int TOP_WORDS = 500;

    /** Shortest extracted name that is kept. */
    private static final int MIN_NAME_LENGTH = 3;

    private final Path path;

    ProcessJSONFiles(Path path){
      this.path = path;
    }

    /**
     * Collects the {@code json} files under the directory, builds their corpora on an
     * executor, merges them, and (when the monitor is active) logs cluster statistics.
     */
    void process() {

      final Corpus<String> global = Corpus.ofStrings();
      final WordsTokenizer tokenizer = Tokenizers.tokenizeString(StopWords.all());

      final Collection<Callable<Corpus<String>>> tasks = Lists.newArrayList();
      final List<File> files = IO.collectFiles(path, "json");

      MONITOR.info(String.format("About to process %d", files.size()));

      files.forEach(c -> tasks.add(() -> buildCorpus(c)));

      final ExecutorService service = Threads.scaleExecutor(files.size());

      try {
        final List<Future<Corpus<String>>> results = service.invokeAll(tasks);
        for (Future<Corpus<String>> each : results) {
          try {
            global.add(each.get());
          } catch (ExecutionException e) {
            // A single failed file must neither be reported as an interruption
            // nor discard the corpora of the remaining files.
            MONITOR.error("Unable to build corpus for one of the JSON files", e);
          }
        }
      } catch (InterruptedException e) {
        // Restore the flag so callers can observe the interruption.
        Thread.currentThread().interrupt();
      } finally {
        // Always release the executor, even if merging fails unexpectedly.
        Threads.shutdownService(service);
      }

      final List<Word> words = Introspector.typicalityQuery(TOP_WORDS, global, tokenizer);
      final Grouping.Groups groups = Grouping.groupDocsUsingWords(words);

      if (MONITOR.isActive()) {

        MONITOR.info(String.format("Produced %d clusters", groups.size()));

        final int totalSize = groups.groupList().stream().mapToInt(Grouping.Group::size).sum();
        // Guard against division by zero when no cluster was produced.
        final int avgSize = groups.size() == 0 ? 0 : totalSize / groups.size();
        // Summing the sizes of size-1 groups yields their count.
        final int singletons = groups.groupList().stream()
          .filter(c -> c.size() == 1)
          .mapToInt(Grouping.Group::size)
          .sum();

        MONITOR.info(String.format("Cluster average size: %d ", avgSize));
        MONITOR.info(String.format("Total number of singleton clusters: %d ", singletons));

      }

    }

    /**
     * Builds a corpus from the first line of {@code file}. For every {@code {...}}
     * entry on that line, the text after the last {@code ':'} (minus a fixed number
     * of surrounding characters) is added when it is at least
     * {@link #MIN_NAME_LENGTH} characters long.
     *
     * @param file the JSON file to read.
     * @return the corpus built from the file; empty when the file has no lines or
     *     cannot be read.
     */
    private static Corpus<String> buildCorpus(File file){
      final Corpus<String> local = Corpus.ofStrings();

      try {

        final List<String> lines = Files.readAllLines(file.toPath());
        if (lines.isEmpty()) {
          return local;
        }

        // Only the first line is inspected; presumably each file holds its
        // whole payload on a single line -- TODO confirm.
        final Matcher entries = LOG_ENTRY.matcher(lines.get(0));

        while (entries.find()) {
          final String content = entries.group(1);

          // Offsets +3 / -2 presumably skip the separator and quoting around the
          // value that follows the last ':' -- verify against the data format.
          final int begin = content.lastIndexOf(":") + 3;
          final int end = content.length() - 2;
          if (begin > end) {
            // Entry too short to hold a value; skip it instead of letting
            // substring throw StringIndexOutOfBoundsException.
            continue;
          }

          final String classname = content.substring(begin, end);
          if (classname.length() < MIN_NAME_LENGTH) {
            continue;
          }

          local.add(classname);
        }

      } catch (IOException e) {
        MONITOR.error("Unable to read " + file.getName(), e);
      }

      return local;
    }
  }
}
Loading