|
| 1 | +""" |
| 2 | +Integrations with the AWS Glue Job. |
| 3 | +
|
| 4 | +""" |
| 5 | +import time |
| 6 | +from typing import Any, Optional |
| 7 | + |
| 8 | +from prefect.blocks.abstract import JobBlock, JobRun |
| 9 | +from pydantic import VERSION as PYDANTIC_VERSION |
| 10 | + |
| 11 | +if PYDANTIC_VERSION.startswith("2."): |
| 12 | + from pydantic.v1 import BaseModel, Field |
| 13 | +else: |
| 14 | + from pydantic import BaseModel, Field |
| 15 | + |
| 16 | +from prefect_aws import AwsCredentials |
| 17 | + |
| 18 | +_GlueJobClient = Any |
| 19 | + |
| 20 | + |
| 21 | +class GlueJobRun(JobRun, BaseModel): |
| 22 | + """Execute a Glue Job""" |
| 23 | + |
| 24 | + job_name: str = Field( |
| 25 | + ..., |
| 26 | + title="AWS Glue Job Name", |
| 27 | + description="The name of the job definition to use.", |
| 28 | + ) |
| 29 | + |
| 30 | + job_id: str = Field( |
| 31 | + ..., |
| 32 | + title="AWS Glue Job ID", |
| 33 | + description="The ID of the job run.", |
| 34 | + ) |
| 35 | + |
| 36 | + job_watch_poll_interval: float = Field( |
| 37 | + default=60.0, |
| 38 | + description=( |
| 39 | + "The amount of time to wait between AWS API calls while monitoring the " |
| 40 | + "state of an Glue Job." |
| 41 | + ), |
| 42 | + ) |
| 43 | + |
| 44 | + _error_states = ["FAILED", "STOPPED", "ERROR", "TIMEOUT"] |
| 45 | + |
| 46 | + aws_credentials: AwsCredentials = Field( |
| 47 | + title="AWS Credentials", |
| 48 | + default_factory=AwsCredentials, |
| 49 | + description="The AWS credentials to use to connect to Glue.", |
| 50 | + ) |
| 51 | + |
| 52 | + client: _GlueJobClient = Field(default=None, description="") |
| 53 | + |
| 54 | + async def fetch_result(self) -> str: |
| 55 | + """fetch glue job state""" |
| 56 | + job = self._get_job_run() |
| 57 | + return job["JobRun"]["JobRunState"] |
| 58 | + |
| 59 | + def wait_for_completion(self) -> None: |
| 60 | + """ |
| 61 | + Wait for the job run to complete and get exit code |
| 62 | + """ |
| 63 | + self.logger.info(f"watching job {self.job_name} with run id {self.job_id}") |
| 64 | + while True: |
| 65 | + job = self._get_job_run() |
| 66 | + job_state = job["JobRun"]["JobRunState"] |
| 67 | + if job_state in self._error_states: |
| 68 | + # Generate a dynamic exception type from the AWS name |
| 69 | + self.logger.error(f"job failed: {job['JobRun']['ErrorMessage']}") |
| 70 | + raise RuntimeError(job["JobRun"]["ErrorMessage"]) |
| 71 | + elif job_state == "SUCCEEDED": |
| 72 | + self.logger.info(f"job succeeded: {self.job_id}") |
| 73 | + break |
| 74 | + |
| 75 | + time.sleep(self.job_watch_poll_interval) |
| 76 | + |
| 77 | + def _get_job_run(self): |
| 78 | + """get glue job""" |
| 79 | + return self.client.get_job_run(JobName=self.job_name, RunId=self.job_id) |
| 80 | + |
| 81 | + |
| 82 | +class GlueJobBlock(JobBlock): |
| 83 | + """Execute a job to the AWS Glue Job service. |
| 84 | +
|
| 85 | + Attributes: |
| 86 | + job_name: The name of the job definition to use. |
| 87 | + arguments: The job arguments associated with this run. |
| 88 | + For this job run, they replace the default arguments set in the job |
| 89 | + definition itself. |
| 90 | + You can specify arguments here that your own job-execution script consumes, |
| 91 | + as well as arguments that Glue itself consumes. |
| 92 | + Job arguments may be logged. Do not pass plaintext secrets as arguments. |
| 93 | + Retrieve secrets from a Glue Connection, Secrets Manager or other secret |
| 94 | + management mechanism if you intend to keep them within the Job. |
| 95 | + [doc](https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html) |
| 96 | + job_watch_poll_interval: The amount of time to wait between AWS API |
| 97 | + calls while monitoring the state of a Glue Job. |
| 98 | + default is 60s because of jobs that use AWS Glue versions 2.0 and later |
| 99 | + have a 1-minute minimum. |
| 100 | + [AWS Glue Pricing](https://aws.amazon.com/glue/pricing/?nc1=h_ls) |
| 101 | +
|
| 102 | + Example: |
| 103 | + Start a job to AWS Glue Job. |
| 104 | + ```python |
| 105 | + from prefect import flow |
| 106 | + from prefect_aws import AwsCredentials |
| 107 | + from prefect_aws.glue_job import GlueJobBlock |
| 108 | +
|
| 109 | +
|
| 110 | + @flow |
| 111 | + def example_run_glue_job(): |
| 112 | + aws_credentials = AwsCredentials( |
| 113 | + aws_access_key_id="your_access_key_id", |
| 114 | + aws_secret_access_key="your_secret_access_key" |
| 115 | + ) |
| 116 | + glue_job_run = GlueJobBlock( |
| 117 | + job_name="your_glue_job_name", |
| 118 | + arguments={"--YOUR_EXTRA_ARGUMENT": "YOUR_EXTRA_ARGUMENT_VALUE"}, |
| 119 | + ).trigger() |
| 120 | +
|
| 121 | + return glue_job_run.wait_for_completion() |
| 122 | +
|
| 123 | +
|
| 124 | + example_run_glue_job() |
| 125 | + ``` |
| 126 | + """ |
| 127 | + |
| 128 | + job_name: str = Field( |
| 129 | + ..., |
| 130 | + title="AWS Glue Job Name", |
| 131 | + description="The name of the job definition to use.", |
| 132 | + ) |
| 133 | + |
| 134 | + arguments: Optional[dict] = Field( |
| 135 | + default=None, |
| 136 | + title="AWS Glue Job Arguments", |
| 137 | + description="The job arguments associated with this run.", |
| 138 | + ) |
| 139 | + job_watch_poll_interval: float = Field( |
| 140 | + default=60.0, |
| 141 | + description=( |
| 142 | + "The amount of time to wait between AWS API calls while monitoring the " |
| 143 | + "state of an Glue Job." |
| 144 | + ), |
| 145 | + ) |
| 146 | + |
| 147 | + aws_credentials: AwsCredentials = Field( |
| 148 | + title="AWS Credentials", |
| 149 | + default_factory=AwsCredentials, |
| 150 | + description="The AWS credentials to use to connect to Glue.", |
| 151 | + ) |
| 152 | + |
| 153 | + async def trigger(self) -> GlueJobRun: |
| 154 | + """trigger for GlueJobRun""" |
| 155 | + client = self._get_client() |
| 156 | + job_run_id = self._start_job(client) |
| 157 | + return GlueJobRun( |
| 158 | + job_name=self.job_name, |
| 159 | + job_id=job_run_id, |
| 160 | + job_watch_poll_interval=self.job_watch_poll_interval, |
| 161 | + ) |
| 162 | + |
| 163 | + def _start_job(self, client: _GlueJobClient) -> str: |
| 164 | + """ |
| 165 | + Start the AWS Glue Job |
| 166 | + [doc](https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/glue/client/start_job_run.html) |
| 167 | + """ |
| 168 | + self.logger.info( |
| 169 | + f"starting job {self.job_name} with arguments {self.arguments}" |
| 170 | + ) |
| 171 | + try: |
| 172 | + response = client.start_job_run( |
| 173 | + JobName=self.job_name, |
| 174 | + Arguments=self.arguments, |
| 175 | + ) |
| 176 | + job_run_id = str(response["JobRunId"]) |
| 177 | + self.logger.info(f"job started with job run id: {job_run_id}") |
| 178 | + return job_run_id |
| 179 | + except Exception as e: |
| 180 | + self.logger.error(f"failed to start job: {e}") |
| 181 | + raise RuntimeError |
| 182 | + |
| 183 | + def _get_client(self) -> _GlueJobClient: |
| 184 | + """ |
| 185 | + Retrieve a Glue Job Client |
| 186 | + """ |
| 187 | + boto_session = self.aws_credentials.get_boto3_session() |
| 188 | + return boto_session.client("glue") |
0 commit comments