Skip to content

Commit 202e4d3

Browse files
committed
transforms args for cloud deploy
1 parent 865bfa8 commit 202e4d3

File tree

2 files changed

+23
-14
lines changed

2 files changed

+23
-14
lines changed

README.md

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,4 +8,11 @@ and per https://github.com/GoogleCloudPlatform/training-data-analyst/tree/master
88
\
99
...
1010

11-
###
11+
### pkg_popularity_pipeline_local.py
12+
Use this script for local deployment. \
13+
Pass the path to the input directory with `--input`.
14+
\
15+
### pkg_popularity_pipeline_cloud.py
16+
Use this script for Dataflow deployment. \
17+
It reads from and writes to a Cloud Storage bucket. \
18+
Set the `BUCKET_ID` and `PROJECT_ID` variables at the top of the file before running. \

pkg_popularity_pipeline_cloud.py

Lines changed: 15 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,11 @@
44
import apache_beam as beam
55
import argparse
66

7+
PROJECT_ID = 'udemy-data-engineer-210920'
8+
BUCKET_ID = 'udemy-data-engineer-210920'
9+
BUCKET_FOLDER = 'dataflow-pipeline-py'
10+
11+
712
def find_matching_lines(line, keyword):
813

914
if line.startswith(keyword):
@@ -58,23 +63,20 @@ def compare_by_value(kv1, kv2):
5863

5964
def run():
6065

61-
parser = argparse.ArgumentParser(description='Find the most used Java packages')
62-
63-
parser.add_argument('--output_prefix',
64-
default='/tmp/output',
65-
help='Output prefix')
66-
67-
parser.add_argument('--input',
68-
default='../javahelp/src/main/java/com/google/cloud/training/dataanalyst/javahelp/',
69-
help='Input files location directory')
70-
71-
options, pipeline_args = parser.parse_known_args()
66+
argv = [
67+
'--project={0}'.format(PROJECT_ID),
68+
'--job_name=verygoodjob',
69+
'--save_main_session',
70+
'--staging_location=gs://{0}/{1}/staging/'.format(BUCKET_ID, BUCKET_FOLDER),
71+
'--temp_location=gs://{0}/{1}/staging/'.format(BUCKET_ID, BUCKET_FOLDER),
72+
'--runner=DataflowRunner'
73+
]
7274

7375
pipeline = beam.Pipeline(argv=pipeline_args)
7476

75-
input = '{0}*.java'.format(options.input)
77+
input = 'gs://{0}/{1}/input/*.java'.format(BUCKET_ID, BUCKET_FOLDER)
7678

77-
output_prefix = options.output_prefix
79+
output_prefix = 'gs://{0}/{1}/output'.format(BUCKET_ID, BUCKET_FOLDER)
7880

7981
keyword = 'import'
8082

0 commit comments

Comments
 (0)