Merged
I added a batch_size to the extract file, allowing for batch processing of projects instead of loading them all at once. During batch processing, I also incorporated timeout handling.
lidiancracy committed Sep 20, 2023
commit 103362bb62b1a4ed55e435320ee7e448d65ab304
52 changes: 21 additions & 31 deletions JavaExtractor/extract.py
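The new ExtractFeaturesForDirsList walks the project directories in slices of batch_size, hands each slice to a worker pool via starmap_async, and abandons any batch whose result.get(timeout=...) expires. Below is a minimal, self-contained sketch of that pattern, assuming the same pool size (4), default batch size (3), and 60-second timeout as the diff; extract_one is a hypothetical stand-in for ParallelExtractDir:

import itertools
import multiprocessing


def extract_one(args, tmp_dir, project_dir):
    # hypothetical stand-in for ParallelExtractDir: extract one project
    pass


def extract_in_batches(args, tmp_dir, dirs, batch_size=3, timeout_seconds=60):
    for i in range(0, len(dirs), batch_size):
        batch = dirs[i:i + batch_size]
        try:
            with multiprocessing.Pool(4) as pool:
                result = pool.starmap_async(
                    extract_one,
                    zip(itertools.repeat(args), itertools.repeat(tmp_dir), batch))
                # raises multiprocessing.TimeoutError if the batch overruns
                result.get(timeout=timeout_seconds)
        except multiprocessing.TimeoutError:
            continue  # skip the stuck batch; later batches still run

Compared with the previous single starmap over every directory at once, a hung project now only costs its own batch rather than stalling the whole extraction.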
@@ -3,30 +3,22 @@
import itertools
import multiprocessing
import os
import sys
import shutil
import subprocess
from threading import Timer
import sys
from argparse import ArgumentParser
from subprocess import Popen, PIPE, STDOUT, call


def get_immediate_subdirectories(a_dir):
    return [(os.path.join(a_dir, name)) for name in os.listdir(a_dir)
            if os.path.isdir(os.path.join(a_dir, name))]


def ParallelExtractDir(args, tmpdir, dir_):
    ExtractFeaturesForDir(args,tmpdir, dir_, "")

    ExtractFeaturesForDir(args, tmpdir, dir_, "")

def ExtractFeaturesForDir(args, tmpdir, dir_, prefix):
    command = ['java', '-cp', args.jar, 'JavaExtractor.App',
               '--max_path_length', str(args.max_path_length), '--max_path_width', str(args.max_path_width),
               '--dir', dir_, '--num_threads', str(args.num_threads)]
    # print command
    # os.system(command)
    kill = lambda process: process.kill()
    outputFileName = tmpdir + prefix + dir_.split('/')[-1]
    failed = False
@@ -39,36 +31,35 @@ def ExtractFeaturesForDir(args, tmpdir, dir_, prefix):
    finally:
        timer.cancel()

    if sleeper.poll() == 0:
        if len(stderr) > 0:
            print(sys.stderr, stderr, file=sys.stdout)
    else:
        print(sys.stderr, 'dir: ' + str(dir_) + ' was not completed in time', file=sys.stdout, flush=True)
    if sleeper.poll() != 0:
        failed = True
        subdirs = get_immediate_subdirectories(dir_)
        for subdir in subdirs:
            ExtractFeaturesForDir(args, tmpdir, subdir, prefix + dir_.split('/')[-1] + '_')
    if failed:
        if os.path.exists(outputFileName):
            os.remove(outputFileName)


    if failed and os.path.exists(outputFileName):
        os.remove(outputFileName)

def ExtractFeaturesForDirsList(args, dirs):
tmp_dir = f"./tmp/feature_extractor{os.getpid()}/"
if os.path.exists(tmp_dir):
shutil.rmtree(tmp_dir, ignore_errors=True)
os.makedirs(tmp_dir)
try:
p = multiprocessing.Pool(4)
p.starmap(ParallelExtractDir, zip(itertools.repeat(args),itertools.repeat(tmp_dir), dirs))
#for dir in dirs:
# ExtractFeaturesForDir(args, dir, '')

for i in range(0, len(dirs), args.batch_size):
batch_dirs = dirs[i:i + args.batch_size]
timeout_seconds = 60 # timeout setting
try:
with multiprocessing.Pool(4) as p:
result = p.starmap_async(ParallelExtractDir, zip(itertools.repeat(args), itertools.repeat(tmp_dir), batch_dirs))
result.get(timeout=timeout_seconds)
except multiprocessing.TimeoutError:
continue

output_files = os.listdir(tmp_dir)
for f in output_files:
os.system("cat %s/%s" % (tmp_dir, f))
finally:
shutil.rmtree(tmp_dir, ignore_errors=True)

os.remove(os.path.join(tmp_dir, f))

if __name__ == '__main__':
    parser = ArgumentParser()
@@ -78,6 +69,9 @@ def ExtractFeaturesForDirsList(args, dirs):
parser.add_argument("-j", "--jar", dest="jar", required=True)
parser.add_argument("-dir", "--dir", dest="dir", required=False)
parser.add_argument("-file", "--file", dest="file", required=False)
# add a new parameter batch_size
parser.add_argument("-batch_size", "--batch_size", dest="batch_size", required=False, default=3, type=int)

args = parser.parse_args()

if args.file is not None:
@@ -86,9 +80,5 @@ def ExtractFeaturesForDirsList(args, dirs):
        os.system(command)
    elif args.dir is not None:
        subdirs = get_immediate_subdirectories(args.dir)
        to_extract = subdirs
        if len(subdirs) == 0:
            to_extract = [args.dir.rstrip('/')]
        to_extract = subdirs if subdirs else [args.dir.rstrip('/')]
        ExtractFeaturesForDirsList(args, to_extract)
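Note that ExtractFeaturesForDir already guards each java subprocess on its own: the kill lambda and the timer.cancel() in its finally block belong to a threading.Timer that kills the process when it runs too long. The Popen/Timer construction itself sits in lines collapsed out of the diff, so the arguments below are assumptions; a sketch of that per-process pattern:

import subprocess
from threading import Timer


def run_with_timer(command, timeout_seconds):
    # kill the child if the timer fires before communicate() returns
    kill = lambda process: process.kill()
    proc = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    timer = Timer(timeout_seconds, kill, [proc])
    try:
        timer.start()
        stdout, stderr = proc.communicate()
    finally:
        timer.cancel()
    return proc.poll(), stdout, stderr

With both guards in place, a stuck extractor process is killed by its own timer, while the batch-level result.get(timeout=...) keeps the parent loop from waiting on the pool indefinitely.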