Skip to content
Open
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions openshift_metrics/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,5 +14,26 @@
S3_SECRET_ACCESS_KEY = os.getenv("S3_OUTPUT_SECRET_ACCESS_KEY")
S3_INVOICE_BUCKET = os.getenv("S3_INVOICE_BUCKET", "nerc-invoicing")
S3_METRICS_BUCKET = os.getenv("S3_METRICS_BUCKET", "openshift_metrics")

# Billing Configuration
# Service Unit Names for Rate Lookup from nerc-rates
SU_NAMES = ["GPUV100", "GPUA100", "GPUA100SXM4", "GPUH100", "CPU"]
RESOURCE_NAMES = ["vCPUs", "RAM", "GPUs"]

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These are not configurations for the script. I would suggest leave them where they were in get_su_definitions for now. As mentioned by @naved001, we could move this information to a file at some point in the future.

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, exactly. Let's not touch these now for now. The whole SU types and their names will require some more thought.

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jimmysway This comment is not addressed. These constants are not accessed anywhere in your PR


# Rate Configuration
USE_NERC_RATES_ENV = os.getenv("USE_NERC_RATES")
USE_NERC_RATES = USE_NERC_RATES_ENV.lower() == "true" if USE_NERC_RATES_ENV else None

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would suggest just having USE_NERC_RATES and removing USE_NERC_RATES_ENV. If the user does not set USE_NERC_RATES, that should be equivalent to setting the env var to false:

Suggested change
USE_NERC_RATES_ENV = os.getenv("USE_NERC_RATES")
USE_NERC_RATES = USE_NERC_RATES_ENV.lower() == "true" if USE_NERC_RATES_ENV else None
USE_NERC_RATES = os.getenv("USE_NERC_RATES", "false").lower() == "true"

Do you have a specific reason for why these two env vars needs to be defined seperately?


# Individual rates (used when USE_NERC_RATES=false)
RATE_CPU_SU = os.getenv("RATE_CPU_SU")
RATE_GPU_V100_SU = os.getenv("RATE_GPU_V100_SU")
RATE_GPU_A100SXM4_SU = os.getenv("RATE_GPU_A100SXM4_SU")
RATE_GPU_A100_SU = os.getenv("RATE_GPU_A100_SU")
RATE_GPU_H100_SU = os.getenv("RATE_GPU_H100_SU")

# Legacy rate (for backward compatibility)
GPU_A100_RATE = os.getenv("GPU_A100_RATE")

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I still don't see where this constant is used


# Prometheus Query Configuration
PROM_QUERY_INTERVAL_MINUTES = int(os.getenv("PROM_QUERY_INTERVAL_MINUTES", 15))
assert PROM_QUERY_INTERVAL_MINUTES >= 1, "Query interval must be at least 1 minute"
68 changes: 47 additions & 21 deletions openshift_metrics/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,16 @@

from openshift_metrics import utils, invoice
from openshift_metrics.metrics_processor import MetricsProcessor
from openshift_metrics.config import S3_INVOICE_BUCKET, PROM_QUERY_INTERVAL_MINUTES
from openshift_metrics.config import (
S3_INVOICE_BUCKET,
USE_NERC_RATES,
RATE_CPU_SU,
RATE_GPU_V100_SU,
RATE_GPU_A100SXM4_SU,
RATE_GPU_A100_SU,
RATE_GPU_H100_SU,
PROM_QUERY_INTERVAL_MINUTES,
)

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -90,16 +99,6 @@ def main():
nargs="*",
help="List of timestamp ranges in UTC to ignore in the format 'YYYY-MM-DDTHH:MM:SS,YYYY-MM-DDTHH:MM:SS'",
)
parser.add_argument(
"--use-nerc-rates",
action="store_true",
help="Use rates from the nerc-rates repo",
)
parser.add_argument("--rate-cpu-su", type=Decimal)
parser.add_argument("--rate-gpu-v100-su", type=Decimal)
parser.add_argument("--rate-gpu-a100sxm4-su", type=Decimal)
parser.add_argument("--rate-gpu-a100-su", type=Decimal)
parser.add_argument("--rate-gpu-h100-su", type=Decimal)

args = parser.parse_args()
files = args.files
Expand Down Expand Up @@ -165,7 +164,12 @@ def main():
datetime.strptime(report_start_date, "%Y-%m-%d"), "%Y-%m"
)

if args.use_nerc_rates:
if USE_NERC_RATES is None:
raise ValueError(
"USE_NERC_RATES environment variable must be set to 'true' or 'false'"
)

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Following from my comment above, this if clause should also be removed. I think USE_NERC_RATES not being set should be interpreted as False


if USE_NERC_RATES:
logger.info("Using nerc rates for rates and outages")
rates_data = rates.load_from_url()
invoice_rates = invoice.Rates(
Expand All @@ -178,16 +182,29 @@ def main():
gpu_h100=rates_data.get_value_at("GPUH100 SU Rate", report_month, Decimal),
)
outage_data = outages.load_from_url()
report_start_date_dt = datetime.strptime(report_start_date, "%Y-%m-%d")
report_end_date_dt = datetime.strptime(report_end_date, "%Y-%m-%d")
ignore_hours = outage_data.get_outages_during(
report_start_date, report_end_date, cluster_name
report_start_date_dt, report_end_date_dt, cluster_name

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The current upstream code runs fine with report_start_date and report_end_date. Is there a reason you're converting them to datetime objects here?

)
else:
if RATE_CPU_SU is None:
raise ValueError("RATE_CPU_SU environment variable must be set")
if RATE_GPU_V100_SU is None:
raise ValueError("RATE_GPU_V100_SU environment variable must be set")
if RATE_GPU_A100SXM4_SU is None:
raise ValueError("RATE_GPU_A100SXM4_SU environment variable must be set")
if RATE_GPU_A100_SU is None:
raise ValueError("RATE_GPU_A100_SU environment variable must be set")
if RATE_GPU_H100_SU is None:
raise ValueError("RATE_GPU_H100_SU environment variable must be set")

Comment on lines +184 to +194

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In the upstream version of the code, an error is already raised by the Rates dataclass if the SU rates are None. I suppose these a bit more informative, but I would argue they are not entirely necessary and leads to more maintenance burden down the line.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just tapping back into this issue sorry about the delay. I don't see where the Rates dataclass validates? But yeah all the other comments I think I've addressed

invoice_rates = invoice.Rates(
cpu=Decimal(args.rate_cpu_su),
gpu_a100=Decimal(args.rate_gpu_a100_su),
gpu_a100sxm4=Decimal(args.rate_gpu_a100sxm4_su),
gpu_v100=Decimal(args.rate_gpu_v100_su),
gpu_h100=Decimal(args.rate_gpu_h100_su),
cpu=Decimal(RATE_CPU_SU),
gpu_a100=Decimal(RATE_GPU_A100_SU),
gpu_a100sxm4=Decimal(RATE_GPU_A100SXM4_SU),
gpu_v100=Decimal(RATE_GPU_V100_SU),
gpu_h100=Decimal(RATE_GPU_H100_SU),
)
ignore_hours = args.ignore_hours

Expand All @@ -210,15 +227,24 @@ def main():
else:
pod_report_file = f"Pod NERC OpenShift {report_month}.csv"

report_start_date = datetime.strptime(report_start_date, "%Y-%m-%d").replace(
report_start_date_dt = datetime.strptime(report_start_date, "%Y-%m-%d").replace(
tzinfo=UTC
)
report_end_date_dt = datetime.strptime(report_end_date, "%Y-%m-%d").replace(
Comment on lines +223 to +226

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If you want to indicate that report_start_date and report_end_date are datetime objects, I would suggest using type annotation instead, which I think would be simpler to interpret than having to rename references and assignements.

Suggested change
report_start_date_dt = datetime.strptime(report_start_date, "%Y-%m-%d").replace(
tzinfo=UTC
)
report_end_date_dt = datetime.strptime(report_end_date, "%Y-%m-%d").replace(
report_start_date : datetime = datetime.strptime(report_start_date, "%Y-%m-%d").replace(
tzinfo=UTC
)
report_end_date : datetime = datetime.strptime(report_end_date, "%Y-%m-%d").replace(
tzinfo=UTC
)

Is there another reason why you renamed these variables?

tzinfo=UTC
)
report_end_date = datetime.strptime(report_end_date, "%Y-%m-%d").replace(tzinfo=UTC)

if report_start_date_dt.month != report_end_date_dt.month:
logger.warning("The report spans multiple months")
report_month += " to " + datetime.strftime(report_end_date_dt, "%Y-%m")

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This if clause is no longer needed. It will cause an error with the call to nerc-rates in get_su_definitions


logger.info(
f"Generating report from {report_start_date} to {report_end_date + timedelta(days=1)} for {cluster_name}"
f"Generating report from {report_start_date_dt} to {report_end_date_dt + timedelta(days=1)} for {cluster_name}"
)

report_start_date = report_start_date_dt
report_end_date = report_end_date_dt

condensed_metrics_dict = processor.condense_metrics(
["cpu_request", "memory_request", "gpu_request", "gpu_type"]
)
Expand Down