# Decision tree using Python

import sys
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report


def load_df(path="./listings.csv"):
    """
    Load the listings CSV into a DataFrame, replacing missing cells with 0.

    Args:
        path: Location of the CSV file. Defaults to ``./listings.csv``
            (relative to the current working directory).

    Returns:
        A pandas DataFrame in which every missing value has been filled
        with the integer 0. Text columns therefore hold a mix of strings
        and 0 wherever the source cell was empty.
    """
    df = pd.read_csv(path).fillna(0)
    return df


def analyze_correlations(df):
    """
    Compute and print correlation of numeric features with listing_quality.

    Only the candidate columns that exist in ``df`` and hold numeric (or
    boolean) data take part in the correlation; other columns are skipped.

    Args:
        df: DataFrame containing a numeric ``listing_quality`` column plus
            any subset of the candidate feature columns listed below.

    Returns:
        A pandas Series indexed by column name holding each column's
        correlation with ``listing_quality``, sorted ascending. The same
        Series is printed to stdout.

    Raises:
        KeyError: If ``listing_quality`` is missing from ``df`` or is not
            numeric.
    """
    target = "listing_quality"

    corr_cols = [
        "price_num",
        "host_is_superhost",
        "host_listings_count",
        "calculated_host_listings_count",
        "calculated_host_listings_count_entire_homes",
        "calculated_host_listings_count_private_rooms",
        "calculated_host_listings_count_shared_rooms",
        "review_scores_rating",
        "review_scores_accuracy",
        "review_scores_cleanliness",
        "review_scores_checkin",
        "review_scores_communication",
        "review_scores_location",
        "review_scores_value",
        "number_of_reviews",
        "reviews_per_month",
        "accommodates",
        "bathrooms",
        "beds",
        "room_type_score",
        "amenities_count",
        "instant_bookable",
        "latitude",
        "longitude",
        "availability_30",
        "availability_60",
        "availability_90",
        "availability_365",
        target,
    ]

    # Filter only columns that actually exist in dataframe (to avoid KeyErrors)
    present_cols = [col for col in corr_cols if col in df.columns]

    # Keep numeric/bool columns only: text columns cannot be correlated and
    # make DataFrame.corr() raise on recent pandas versions.
    numeric = df[present_cols].select_dtypes(include=["number", "bool"])

    if target not in numeric.columns:
        raise KeyError(
            f"'{target}' column is missing or non-numeric; "
            "cannot compute correlations"
        )

    # Compute the correlation matrix and keep only the target's column
    target_corr = numeric.corr()[target].sort_values()

    print("\n===== Correlation with listing_quality =====")
    print(target_corr)

    return target_corr


def _count_amenities(raw):
    """Return the number of comma-separated amenities in one raw cell."""
    text = str(raw).strip()
    # "0" is the missing-value fill; "[]" / "{}" are empty lists. None of
    # these contain an amenity, so they must not be counted as one.
    if text in {"", "0", "nan", "[]", "{}"}:
        return 0
    return text.count(",") + 1


def preprocess_df(df):
    """
    Derive numeric feature columns and the binary ``listing_quality`` target.

    The frame is modified in place and also returned.

    Reads the columns ``bathrooms_text``, ``instant_bookable``,
    ``amenities``, ``price``, ``room_type``, ``host_is_superhost`` and
    ``review_scores_rating``. Missing cells may be encoded as 0 (integer or
    string) and are treated as "absent".

    Writes/overwrites:
        bathrooms: first number found in ``bathrooms_text`` (0 if none).
        instant_bookable: 1 where the value is ``"t"``, else 0.
        amenities_count: number of comma-separated amenities (0 if empty).
        price_num: ``price`` with ``$`` and ``,`` stripped, as float.
        room_type_score: fixed weight per room type (0 for unknown types).
        host_is_superhost: 1 for ``"t"``, 0 for ``"f"``, missing or unknown.
        listing_quality: 1 if the weighted score is at or above the median
            score of the frame, else 0.

    Args:
        df: Raw listings DataFrame.

    Returns:
        The same DataFrame object with the columns above added/replaced.

    Raises:
        KeyError: If a required input column is absent.
        ValueError: If a ``price`` cell cannot be parsed as a number.
    """
    # Process bathrooms (expand=False keeps the result a Series)
    df["bathrooms"] = (
        df["bathrooms_text"]
        .astype(str)
        .str.extract(r"(\d+\.?\d*)", expand=False)
        .astype(float)
        .fillna(0)
    )

    # Instant bookable to numeric
    df["instant_bookable"] = df["instant_bookable"].apply(
        lambda x: 1 if x == "t" else 0
    )

    # Count amenities
    df["amenities_count"] = df["amenities"].apply(_count_amenities)

    # Price numeric
    df["price_num"] = (
        df["price"]
        .astype(str)
        .str.replace("$", "", regex=False)
        .str.replace(",", "", regex=False)
        .astype(float)
    )

    # Room type scoring; unknown or missing room types score 0 so no NaN
    # leaks into the feature matrix.
    room_type_weights = {
        "Entire home/apt": 4.0,
        "Private room": 3.0,
        "Hotel room": 2.5,
        "Shared room": 1.0,
    }
    df["room_type_score"] = df["room_type"].map(room_type_weights).fillna(0.0)

    # Superhost flag. Cast to str first: missing cells may arrive as the
    # integer 0, which would not match the "0" key and would turn into NaN.
    superhost = (
        df["host_is_superhost"]
        .astype(str)
        .map({"t": 1, "f": 0, "0": 0})
        .fillna(0)
        .astype(int)
    )

    # NOTE(review): dividing by 100 presumes ratings on a 0-100 scale;
    # confirm against the dataset in use.
    rating = pd.to_numeric(df["review_scores_rating"], errors="coerce").fillna(0)

    # Price relative to the most expensive listing. Guard against an empty
    # frame or an all-zero price column, which would give 0/0 = NaN.
    max_price = df["price_num"].max()
    if max_price > 0:
        price_ratio = df["price_num"].fillna(0) / max_price
    else:
        price_ratio = 0.0

    # Listing quality target: weighted score, then median split
    quality_score = 0.4 * superhost + 0.3 * (rating / 100) + 0.3 * (1 - price_ratio)

    df["host_is_superhost"] = superhost
    threshold = quality_score.median()
    df["listing_quality"] = (quality_score >= threshold).astype(int)

    return df


def train_models(df):
    """
    Fit a depth-limited decision tree on listing features and print metrics.

    The frame is split 67/33 into train and test partitions with a fixed
    seed, a ``DecisionTreeClassifier`` is trained to predict
    ``listing_quality``, and the test accuracy plus classification report
    are written to stdout. Nothing is returned.

    Args:
        df: DataFrame holding the six feature columns listed below and the
            ``listing_quality`` target column.
    """
    feature_names = [
        "room_type_score",
        "accommodates",
        "bathrooms",
        "beds",
        "amenities_count",
        "review_scores_location",
    ]

    features = df[feature_names]
    labels = df["listing_quality"]

    # Fixed seed keeps the partition reproducible between runs
    partitions = train_test_split(features, labels, test_size=0.33, random_state=42)
    train_features, test_features, train_labels, test_labels = partitions

    # Decision Tree
    print("===== Decision Tree =====")
    # Depth is capped at 5 to prevent overfitting
    tree = DecisionTreeClassifier(max_depth=5, random_state=42)
    tree.fit(train_features, train_labels)
    predictions = tree.predict(test_features)

    accuracy = accuracy_score(test_labels, predictions)
    print("Accuracy:", accuracy)
    report = classification_report(test_labels, predictions)
    print("Classification Report:\n", report)


if __name__ == "__main__":
    # Script entry point: load the CSV, engineer features and the target,
    # report feature/target correlations, then train and evaluate the tree.
    listings = preprocess_df(load_df())
    analyze_correlations(listings)
    train_models(listings)