Skip to content
62 changes: 36 additions & 26 deletions CSharpExtractor/CSharpExtractor/Extractor/Extractor.cs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Diagnostics;


namespace Extractor
{
Expand All @@ -27,12 +29,14 @@ public class Extractor
/// <summary>
/// Width limit for path search. Copied from <c>Options.MaxWidth</c> by the constructor
/// and handed to <c>PathFinder</c> when paths are extracted.
/// </summary>
public int WidthLimit { get; set; }
/// <summary>
/// The source text given to the constructor.
/// </summary>
public string Code { get; set; }
/// <summary>
/// Set by the constructor to the negation of <c>Options.NoHash</c>.
/// NOTE(review): presumably consulted by MaybeHash when emitting path strings - confirm.
/// </summary>
public bool ShouldHash { get; set; }
/// <summary>
/// Sample size passed to <c>Utilities.ReservoirSample</c> in <c>GetInternalPaths</c>.
/// Copied from <c>Options.MaxContexts</c> by the constructor.
/// </summary>
public int MaxContexts { get; set; }

/// <summary>
/// Creates an extractor for one piece of source text, copying the limits and
/// flags it needs out of the parsed command-line options.
/// </summary>
/// <param name="code">Source text to extract from; stored in <see cref="Code"/>.</param>
/// <param name="opts">Parsed options supplying MaxLength, MaxWidth, NoHash and MaxContexts.</param>
public Extractor(string code, Options opts)
{
    LengthLimit = opts.MaxLength;
    WidthLimit = opts.MaxWidth;
    // The option is phrased negatively ("no_hash"); the property is the positive form.
    ShouldHash = !opts.NoHash;
    MaxContexts = opts.MaxContexts;
    Code = code;
}

Expand Down Expand Up @@ -104,29 +108,32 @@ private string PathToString(PathFinder.Path path)
return builder.ToString();
}

/// <summary>
/// Lazily yields the paths found between leaves of sampled variable pairs.
/// </summary>
/// <remarks>
/// The candidate pairs are every 2-combination of <c>variables</c> followed by each
/// variable paired with itself. At most <see cref="MaxContexts"/> of those pairs are
/// kept by reservoir sampling. Note that the cap applies to the number of variable
/// pairs, not to the number of paths yielded: one pair can produce several paths.
/// </remarks>
/// <param name="tree">Tree handed to the <c>PathFinder</c> that searches for paths.</param>
/// <returns>Every non-null path found between two distinct leaves of a sampled pair.</returns>
internal IEnumerable<PathFinder.Path> GetInternalPaths(Tree tree)
{
    var finder = new PathFinder(tree, LengthLimit, WidthLimit);

    var allPairs = Utilities.ReservoirSample(
        Utilities.WeakConcat(
            Utilities.Choose2(variables),
            variables.Select((arg) => new Tuple<Variable, Variable>(arg, arg))),
        MaxContexts);

    // Iterate over the sampled variable-variable pairs.
    foreach (Tuple<Variable, Variable> varPair in allPairs)
    {
        foreach (var rhs in varPair.Item2.Leaves)
        {
            foreach (var lhs in varPair.Item1.Leaves)
            {
                // A leaf is never connected to itself (relevant for self-pairs).
                if (lhs == rhs)
                {
                    continue;
                }

                PathFinder.Path path = finder.FindPath(lhs, rhs, limited: true);

                // FindPath can return null; such results are dropped.
                // NOTE(review): presumably null means no path within the limits - confirm in PathFinder.
                if (path == null)
                {
                    continue;
                }

                yield return path;
            }
        }
    }
}

Expand Down Expand Up @@ -167,6 +174,7 @@ public List<String> Extract()
List<String> results = new List<string>();

foreach(var method in methods) {

String methodName = method.Identifier.ValueText;
Tree methodTree = new Tree(method);
var subtokensMethodName = Utilities.SplitToSubtokens(methodName);
Expand All @@ -185,10 +193,12 @@ public List<String> Extract()

foreach (PathFinder.Path path in GetInternalPaths(methodTree))
{
contexts.Add(SplitNameUnlessEmpty(tokenToVar[path.Left].Name)
String pathString = SplitNameUnlessEmpty(tokenToVar[path.Left].Name)
+ "," + MaybeHash(this.PathNodesToString(path))
+ "," + SplitNameUnlessEmpty(tokenToVar[path.Right].Name));
+ "," + SplitNameUnlessEmpty(tokenToVar[path.Right].Name);

Debug.WriteLine(path.Left.FullSpan+" "+tokenToVar[path.Left].Name+ "," +this.PathNodesToString(path)+ "," + tokenToVar[path.Right].Name+" "+path.Right.FullSpan);
contexts.Add(pathString);
}

var commentNodes = tree.GetRoot().DescendantTrivia().Where(
Expand All @@ -206,7 +216,7 @@ public List<String> Extract()
contexts.Add(batch + "," + "COMMENT" + "," + batch);
}
}
results.Add(String.Join("|", subtokensMethodName) + " " + String.Join(" ", contexts));
results.Add(String.Join("|", subtokensMethodName) + " " + String.Join(" ", contexts));
}
return results;
}
Expand Down
7 changes: 5 additions & 2 deletions CSharpExtractor/CSharpExtractor/Extractor/Program.cs
Original file line number Diff line number Diff line change
Expand Up @@ -44,9 +44,12 @@ static void Main(string[] args)

results = files.AsParallel().WithDegreeOfParallelism(options.Threads).SelectMany(filename => ExtractSingleFile(filename, options));

foreach (var res in results)
using (StreamWriter sw = new StreamWriter(options.OFileName, append: true))
{
Console.WriteLine(res);
foreach (var res in results)
{
sw.WriteLine(res);
}
}
}
}
Expand Down
45 changes: 43 additions & 2 deletions CSharpExtractor/CSharpExtractor/Extractor/Utilities.cs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Diagnostics;
using System.Text.RegularExpressions;

namespace Extractor
Expand All @@ -21,11 +22,17 @@ public class Options
/// <summary>
/// Value of the <c>max_width</c> command-line option (default 2). The extractor
/// copies it into its width limit for path search.
/// </summary>
[Option('l', "max_width", Default = 2, HelpText = "Max path width")]
public int MaxWidth { get; set; }

/// <summary>
/// Value of the <c>ofile_name</c> command-line option: the file the program
/// appends its result lines to. Defaults to "test.txt".
/// </summary>
[Option('o', "ofile_name", HelpText = "Output file name", Default = "test.txt")]
public string OFileName { get; set; }

/// <summary>
/// Value of the <c>no_hash</c> command-line switch (off by default). The extractor
/// stores its negation as its "should hash" flag.
/// </summary>
[Option('h', "no_hash", HelpText = "When enabled, prints the whole path strings (not hashed)", Default = false)]
public bool NoHash { get; set; }

/// <summary>
/// Value of the <c>max_contexts</c> command-line option (default 30000); the
/// extractor uses it as the sample size when sampling variable pairs.
/// </summary>
/// <remarks>
/// Declared with a long name only: the short name 'l' is already taken by
/// <c>max_width</c>, and sharing it would make <c>-l</c> ambiguous.
/// </remarks>
[Option("max_contexts", Default = 30000, HelpText = "Max number of path contexts to sample. Affects only very large snippets")]
public int MaxContexts { get; set; }
}

public class Utilities
public static class Utilities
{
public static String[] NumbericLiteralsToKeep = new String[] { "0", "1", "2", "3", "4", "5", "10" };
public static IEnumerable<Tuple<T, T>> Choose2<T>(IEnumerable<T> enumerable)
Expand All @@ -40,7 +47,41 @@ public static IEnumerable<Tuple<T, T>> Choose2<T>(IEnumerable<T> enumerable)
}
}

public static IEnumerable<T> WeakConcat<T>(IEnumerable<T> enumerable1, IEnumerable<T> enumerable2)
/// <summary>
/// Draws up to <paramref name="numSamples"/> elements uniformly at random from
/// <paramref name="input"/> in a single pass, using reservoir sampling.
/// See https://en.wikipedia.org/wiki/Reservoir_sampling
/// </summary>
/// <typeparam name="TSource">Element type of the sequence.</typeparam>
/// <param name="input">Sequence to sample from; it is enumerated exactly once.</param>
/// <param name="numSamples">Maximum number of elements to return; must not be negative.</param>
/// <returns>
/// All elements in their original order when the input holds at most
/// <paramref name="numSamples"/> elements; otherwise exactly
/// <paramref name="numSamples"/> randomly chosen elements.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="input"/> is null.</exception>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="numSamples"/> is negative.</exception>
public static IEnumerable<TSource> ReservoirSample<TSource>(this IEnumerable<TSource> input, int numSamples)
{
    if (input == null)
    {
        throw new ArgumentNullException(nameof(input));
    }
    if (numSamples < 0)
    {
        throw new ArgumentOutOfRangeException(nameof(numSamples), "Sample size must not be negative.");
    }

    // One generator per call: Random instances are not safe to share between threads.
    var rng = new Random();

    // Deliberately not pre-sized to numSamples: the limit is typically far larger
    // than the input, and reserving it up front would allocate the whole reservoir
    // on every call even when only a handful of elements arrive.
    var sampledElements = new List<TSource>();
    int seenElementCount = 0;

    foreach (var element in input)
    {
        seenElementCount++;
        if (sampledElements.Count < numSamples)
        {
            // Reservoir not full yet: keep everything.
            sampledElements.Add(element);
        }
        else
        {
            // Replace a random slot with probability numSamples / seenElementCount.
            int position = rng.Next(seenElementCount);
            if (position < numSamples)
            {
                sampledElements[position] = element;
            }
        }
    }

    Debug.Assert(sampledElements.Count <= numSamples);
    return sampledElements;
}


public static IEnumerable<T> WeakConcat<T>(IEnumerable<T> enumerable1, IEnumerable<T> enumerable2)
{
foreach (T t in enumerable1)
yield return t;
Expand Down
11 changes: 9 additions & 2 deletions CSharpExtractor/CSharpExtractor/Extractor/Variable.cs
Original file line number Diff line number Diff line change
Expand Up @@ -94,8 +94,15 @@ internal static IEnumerable<Variable> CreateFromMethod(Tree methodTree)
string name = tokenToName[leaf];
SyntaxToken[] syntaxTokens = nameToTokens[name].ToArray();
var v = new Variable(name, syntaxTokens, methodTree);
results.Add(v);
}

//check if exists
var matches = results.Where(p => p.Name == name).ToList();
bool alreadyExists = (matches.Count != 0);
if (!alreadyExists)
{
results.Add(v);
}
}

return results;
}
Expand Down
49 changes: 22 additions & 27 deletions CSharpExtractor/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,35 +27,30 @@ def ParallelExtractDir(args, dir):
def ExtractFeaturesForDir(args, dir, prefix):
command = ['dotnet', 'run', '--project', args.csproj,
'--max_length', str(args.max_path_length), '--max_width', str(args.max_path_width),
'--path', dir, '--threads', str(args.num_threads)]
'--path', dir, '--threads', str(args.num_threads), '--ofile_name', str(args.ofile_name)]


# print command
# os.system(command)
kill = lambda process: process.kill()
outputFileName = TMP_DIR + prefix + dir.split('/')[-1]
failed = False
with open(outputFileName, 'a') as outputFile:
sleeper = subprocess.Popen(command, stdout=outputFile, stderr=subprocess.PIPE)
timer = Timer(600000, kill, [sleeper])

try:
timer.start()
stdout, stderr = sleeper.communicate()
finally:
timer.cancel()

if sleeper.poll() == 0:
if len(stderr) > 0:
print(sys.stderr, stderr, file=sys.stdout)
else:
print(sys.stderr, 'dir: ' + str(dir) + ' was not completed in time', file=sys.stdout)
failed = True
subdirs = get_immediate_subdirectories(dir)
for subdir in subdirs:
ExtractFeaturesForDir(args, subdir, prefix + dir.split('/')[-1] + '_')
if failed:
if os.path.exists(outputFileName):
os.remove(outputFileName)
sleeper = subprocess.Popen(command, stderr=subprocess.PIPE)
timer = Timer(600000, kill, [sleeper])

try:
timer.start()
_, stderr = sleeper.communicate()
finally:
timer.cancel()

if sleeper.poll() == 0:
if len(stderr) > 0:
print(sys.stderr, stderr)
else:
print(sys.stderr, 'dir: ' + str(dir) + ' was not completed in time')
failed = True
subdirs = get_immediate_subdirectories(dir)
for subdir in subdirs:
ExtractFeaturesForDir(args, subdir, prefix + dir.split('/')[-1] + '_')


def ExtractFeaturesForDirsList(args, dirs):
Expand All @@ -77,12 +72,14 @@ def ExtractFeaturesForDirsList(args, dirs):


if __name__ == '__main__':

parser = ArgumentParser()
parser.add_argument("-maxlen", "--max_path_length", dest="max_path_length", required=False, default=8)
parser.add_argument("-maxwidth", "--max_path_width", dest="max_path_width", required=False, default=2)
parser.add_argument("-threads", "--num_threads", dest="num_threads", required=False, default=64)
parser.add_argument("--csproj", dest="csproj", required=True)
parser.add_argument("-dir", "--dir", dest="dir", required=False)
parser.add_argument("-ofile_name", "--ofile_name", dest="ofile_name", required=True)
args = parser.parse_args()

if args.dir is not None:
Expand All @@ -91,5 +88,3 @@ def ExtractFeaturesForDirsList(args, dirs):
if len(subdirs) == 0:
to_extract = [args.dir.rstrip('/')]
ExtractFeaturesForDirsList(args, to_extract)


6 changes: 3 additions & 3 deletions preprocess_csharp.sh
Original file line number Diff line number Diff line change
Expand Up @@ -39,13 +39,13 @@ mkdir -p data
mkdir -p data/${DATASET_NAME}

echo "Extracting paths from validation set..."
${PYTHON} CSharpExtractor/extract.py --dir ${VAL_DIR} --max_path_length 8 --max_path_width 2 --num_threads ${NUM_THREADS} --csproj ${EXTRACTOR_JAR} > ${VAL_DATA_FILE}
${PYTHON} CSharpExtractor/extract.py --dir ${VAL_DIR} --max_path_length 8 --max_path_width 2 --num_threads ${NUM_THREADS} --csproj ${EXTRACTOR_JAR} --ofile_name ${VAL_DATA_FILE}
echo "Finished extracting paths from validation set"
echo "Extracting paths from test set..."
${PYTHON} CSharpExtractor/extract.py --dir ${TEST_DIR} --max_path_length 8 --max_path_width 2 --num_threads ${NUM_THREADS} --csproj ${EXTRACTOR_JAR} > ${TEST_DATA_FILE}
${PYTHON} CSharpExtractor/extract.py --dir ${TEST_DIR} --max_path_length 8 --max_path_width 2 --num_threads ${NUM_THREADS} --csproj ${EXTRACTOR_JAR} --ofile_name ${TEST_DATA_FILE}
echo "Finished extracting paths from test set"
echo "Extracting paths from training set..."
${PYTHON} CSharpExtractor/extract.py --dir ${TRAIN_DIR} --max_path_length 8 --max_path_width 2 --num_threads ${NUM_THREADS} --csproj ${EXTRACTOR_JAR} > ${TRAIN_DATA_FILE}
${PYTHON} CSharpExtractor/extract.py --dir ${TRAIN_DIR} --max_path_length 8 --max_path_width 2 --num_threads ${NUM_THREADS} --csproj ${EXTRACTOR_JAR} --ofile_name ${TRAIN_DATA_FILE}
echo "Finished extracting paths from training set"

TARGET_HISTOGRAM_FILE=data/${DATASET_NAME}/${DATASET_NAME}.histo.tgt.c2v
Expand Down