|
4 | 4 | import apache_beam as beam |
5 | 5 | import argparse |
6 | 6 |
|
| 7 | +PROJECT_ID = 'udemy-data-engineer-210920' |
| 8 | +BUCKET_ID = 'udemy-data-engineer-210920' |
| 9 | +BUCKET_FOLDER = 'dataflow-pipeline-py' |
| 10 | + |
| 11 | + |
7 | 12 | def find_matching_lines(line, keyword): |
8 | 13 |
|
9 | 14 | if line.startswith(keyword): |
@@ -58,23 +63,20 @@ def compare_by_value(kv1, kv2): |
58 | 63 |
|
59 | 64 | def run(): |
60 | 65 |
|
61 | | - parser = argparse.ArgumentParser(description='Find the most used Java packages') |
62 | | - |
63 | | - parser.add_argument('--output_prefix', |
64 | | - default='/tmp/output', |
65 | | - help='Output prefix') |
66 | | - |
67 | | - parser.add_argument('--input', |
68 | | - default='../javahelp/src/main/java/com/google/cloud/training/dataanalyst/javahelp/', |
69 | | - help='Input files location directory') |
70 | | - |
71 | | - options, pipeline_args = parser.parse_known_args() |
| 66 | + argv = [ |
| 67 | + '--project={0}'.format(PROJECT_ID), |
| 68 | + '--job_name=verygoodjob', |
| 69 | + '--save_main_session', |
| 70 | + '--staging_location=gs://{0}/{1}/staging/'.format(BUCKET_ID, BUCKET_FOLDER), |
| 71 | + '--temp_location=gs://{0}/{1}/staging/'.format(BUCKET_ID, BUCKET_FOLDER), |
| 72 | + '--runner=DataflowRunner' |
| 73 | + ] |
72 | 74 |
|
73 | 75 | pipeline = beam.Pipeline(argv=pipeline_args) |
74 | 76 |
|
75 | | - input = '{0}*.java'.format(options.input) |
| 77 | + input = 'gs://{0}/{1}/input/*.java'.format(BUCKET_ID, BUCKET_FOLDER) |
76 | 78 |
|
77 | | - output_prefix = options.output_prefix |
| 79 | + output_prefix = 'gs://{0}/{1}/output'.format(BUCKET_ID, BUCKET_FOLDER) |
78 | 80 |
|
79 | 81 | keyword = 'import' |
80 | 82 |
|
|
0 commit comments