-
Notifications
You must be signed in to change notification settings - Fork 0
make airflow run the Celery queue #31
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -3,12 +3,11 @@ | |||||||||||||||||||
| from pendulum import datetime | ||||||||||||||||||||
| from celery import Celery | ||||||||||||||||||||
| from github import Auth, Github, GithubException | ||||||||||||||||||||
| from client import get_data_from_queue | ||||||||||||||||||||
| from datetime import timedelta | ||||||||||||||||||||
| import time | ||||||||||||||||||||
|
|
||||||||||||||||||||
|
|
||||||||||||||||||||
| api_token, api_token_two = os.getenv("GITHUB_API_TOKEN"), os.getenv("GITHUB_API_TOKEN_SECOND_ACCOUNT") | ||||||||||||||||||||
| auth, auth_two = Auth.Token(api_token), Auth.Token(api_token_two) | ||||||||||||||||||||
| gh, gh_two = Github(auth=auth), Github(auth=auth_two) | ||||||||||||||||||||
|
|
||||||||||||||||||||
| app = Celery( | ||||||||||||||||||||
| 'airflow_client', | ||||||||||||||||||||
| broker = os.getenv('CELERY_BROKER_URL'), | ||||||||||||||||||||
|
|
@@ -22,12 +21,16 @@ | |||||||||||||||||||
| description="Run Celery queue with RabbitMQ as the broker \ | ||||||||||||||||||||
| in order to get GitHub data from the GitHub API", | ||||||||||||||||||||
| tags=["celery_queue"], | ||||||||||||||||||||
| max_consecutive_failed_dag_runs=3, | ||||||||||||||||||||
| max_consecutive_failed_dag_runs=3 | ||||||||||||||||||||
| ) | ||||||||||||||||||||
| def run_github_data_queue(): | ||||||||||||||||||||
|
|
||||||||||||||||||||
| @task | ||||||||||||||||||||
| def check_rate_limit(): | ||||||||||||||||||||
| @task(do_xcom_push=True, multiple_outputs=True) | ||||||||||||||||||||
| def check_rate_limit(**context): | ||||||||||||||||||||
| api_token, api_token_two = os.getenv("GITHUB_API_TOKEN"), os.getenv("GITHUB_API_TOKEN_SECOND_ACCOUNT") | ||||||||||||||||||||
| auth, auth_two = Auth.Token(api_token), Auth.Token(api_token_two) | ||||||||||||||||||||
| gh, gh_two = Github(auth=auth), Github(auth=auth_two) | ||||||||||||||||||||
|
|
||||||||||||||||||||
| rate_limit = gh.rate_limiting | ||||||||||||||||||||
| print(f"Rate limit: {rate_limit[0]} remaining / {rate_limit[1]} total") | ||||||||||||||||||||
|
|
||||||||||||||||||||
|
|
@@ -36,16 +39,29 @@ def check_rate_limit(): | |||||||||||||||||||
| "total": rate_limit[1] | ||||||||||||||||||||
| } | ||||||||||||||||||||
|
|
||||||||||||||||||||
| @task | ||||||||||||||||||||
| def run_queue(**context): | ||||||||||||||||||||
| rate_limit = context["ti"].xcom_pull(task_ids="check_rate_limit", key="remaining") | ||||||||||||||||||||
| max_total_api_calls = context["ti"].xcom_pull(task_ids="check_rate_limit", key="total") | ||||||||||||||||||||
|
|
||||||||||||||||||||
| if rate_limit > 100: | ||||||||||||||||||||
| print("IT WORKS") | ||||||||||||||||||||
| print(rate_limit) | ||||||||||||||||||||
|
|
||||||||||||||||||||
| celery_worker = app.send_task("worker.get_github_data") | ||||||||||||||||||||
|
|
||||||||||||||||||||
| print(celery_worker) | ||||||||||||||||||||
|
|
||||||||||||||||||||
| time.sleep(500) | ||||||||||||||||||||
|
Comment on lines
+43
to
+55
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The
This task's logic is unclear and inefficient. It should be refactored to properly gate the data fetching pipeline based on the rate limit, perhaps using a
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The |
||||||||||||||||||||
|
|
||||||||||||||||||||
| @task | ||||||||||||||||||||
| def run_the_queue(rate_limit: str): | ||||||||||||||||||||
| print(f'rate limit: {rate_limit["total"]}, remaining {rate_limit["remaining"]}') | ||||||||||||||||||||
| def save_data_from_queue(): | ||||||||||||||||||||
|
|
||||||||||||||||||||
| if rate_limit["remaining"] > 4900: | ||||||||||||||||||||
| app.send_task("worker.get_data_from_queue", args=[100, 500]) | ||||||||||||||||||||
| get_data_from_queue() | ||||||||||||||||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The called function Here is a suggested fix for # In client.py
def get_data_from_queue():
    the_data = None
    try:
        print("Getting the result")
        response = build_repo_chord(total=5000, batch_size=500)
        the_data = response.get(timeout=3600)
        print(f"Result: {the_data}")
    except Exception as e:
        print(f"Error: {e}")
    if the_data is not None:
        return save_to_parquet(the_data)
Comment on lines
57
to
+60
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The |
||||||||||||||||||||
|
|
||||||||||||||||||||
|
|
||||||||||||||||||||
|
|
||||||||||||||||||||
|
Comment on lines
+58
to
63
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
This function definition includes unnecessary blank lines and incorrect indentation, which harm readability. Please remove them.
Suggested change
|
||||||||||||||||||||
| val = check_rate_limit() | ||||||||||||||||||||
| run_the_queue(rate_limit=val) | ||||||||||||||||||||
| check_rate_limit() >> run_queue() >> save_data_from_queue() | ||||||||||||||||||||
|
|
||||||||||||||||||||
|
|
||||||||||||||||||||
| run_github_data_queue() | ||||||||||||||||||||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,13 +1,32 @@ | ||
| services: | ||
| api-server: | ||
| networks: | ||
| - etl-shared | ||
| volumes: | ||
| - /Users/luisgonzalez/Documents/code/etl pipeline/github_etl:/usr/local/airflow/project | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
A medium-severity vulnerability exists due to a hardcoded absolute path to a local user directory (`/Users/luisgonzalez/Documents/code/etl pipeline/github_etl`).
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The volume mount in - ./:/usr/local/airflow/project |
||
| environment: | ||
| PYTHONPATH: /usr/local/airflow/project | ||
| networks: [etl-shared] | ||
|
|
||
| dag-processor: | ||
| volumes: | ||
| - /Users/luisgonzalez/Documents/code/etl pipeline/github_etl:/usr/local/airflow/project | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The |
||
| environment: | ||
| PYTHONPATH: /usr/local/airflow/project | ||
| networks: [etl-shared] | ||
|
|
||
| scheduler: | ||
| networks: | ||
| - etl-shared | ||
| volumes: | ||
| - /Users/luisgonzalez/Documents/code/etl pipeline/github_etl:/usr/local/airflow/project | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The |
||
| environment: | ||
| PYTHONPATH: /usr/local/airflow/project | ||
| networks: [etl-shared] | ||
|
|
||
| triggerer: | ||
| networks: | ||
| - etl-shared | ||
| volumes: | ||
| - /Users/luisgonzalez/Documents/code/etl pipeline/github_etl:/usr/local/airflow/project | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The |
||
| environment: | ||
| PYTHONPATH: /usr/local/airflow/project | ||
| networks: [etl-shared] | ||
|
|
||
|
|
||
| networks: | ||
| etl-shared: | ||
|
|
||
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
| @@ -1,29 +1,34 @@ | ||||||
| import logging | ||||||
| from pathlib import Path | ||||||
| from datetime import datetime, timezone | ||||||
| from datetime import datetime | ||||||
| from celery.result import AsyncResult | ||||||
| from worker import build_repo_chord | ||||||
| from rb_queue.rabbitmq import consume_repos | ||||||
| import polars as pl | ||||||
|
|
||||||
| DIRECT = Path("/usr/local/airflow/project/data") | ||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The data directory path is hardcoded. This makes the application less flexible and harder to configure for different environments. It is a best practice to source this kind of configuration from an environment variable, for example by using
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The
Suggested change
|
||||||
|
|
||||||
| def save_to_parquet(the_data): | ||||||
| today = datetime.now().strftime("%Y-%m-%d") | ||||||
|
|
||||||
| if not Path(f"data/{today}/").exists(): | ||||||
| Path(f"data/{today}").mkdir(parents=True, exist_ok=True) | ||||||
| # if not Path(f"data/{today}/").exists(): | ||||||
| # Path(f"data/{today}").mkdir(parents=True, exist_ok=True) | ||||||
|
Comment on lines
+14
to
+15
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Comment on lines
+14
to
+15
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||||||
|
|
||||||
| fi_direct = DIRECT / today | ||||||
| fi_direct.mkdir(parents=True, exist_ok=True) | ||||||
|
|
||||||
| print("This is the else of the client") | ||||||
| print(the_data) | ||||||
|
|
||||||
| df = pl.DataFrame(the_data) | ||||||
| df.write_parquet(f"data/{today}/github_data.parquet", compression="zstd") | ||||||
| df.write_parquet(f"{fi_direct}/github_data.parquet", compression="zstd") | ||||||
| print("Valid Parquet data") | ||||||
|
|
||||||
|
|
||||||
| def get_data_from_queue(): | ||||||
| try: | ||||||
| print("Getting the result") | ||||||
|
|
||||||
| response = build_repo_chord(total=5000, batch_size=500) | ||||||
| the_data = response.get(timeout=3600) # 1 hour timeout | ||||||
| print(f"Result: {the_data}") | ||||||
|
|
@@ -36,3 +41,4 @@ def get_data_from_queue(): | |||||
|
|
||||||
| if __name__ == "__main__": | ||||||
| get_data_from_queue() | ||||||
|
|
||||||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The GitHub client `gh_two` and its related authentication variables (`api_token_two`, `auth_two`) are initialized but never used within this task. This adds unnecessary overhead and should be removed to improve code clarity.