# Decision tree using Python
import sys
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
def load_df(path="./listings.csv"):
    """Load the listings CSV and replace every missing value with 0.

    Args:
        path: Location of the CSV file to read. Defaults to
            ``./listings.csv``, the path that used to be hard-coded, so
            existing zero-argument calls behave as before.

    Returns:
        A DataFrame of the file's contents in which every missing cell,
        in every column, has been filled with the integer 0.
    """
    return pd.read_csv(path).fillna(0)
def analyze_correlations(df):
    """Print how each known numeric feature correlates with listing_quality.

    Only the candidate columns that are present in ``df`` and hold numeric
    (or boolean) data take part; anything else is skipped so that a text
    column cannot make the correlation computation fail.

    Args:
        df: DataFrame that contains a numeric ``listing_quality`` column and
            any subset of the candidate feature columns listed below.

    Raises:
        KeyError: If ``listing_quality`` is missing from ``df`` or is not
            numeric, since there is then nothing to correlate against.
    """
    target = "listing_quality"
    candidate_cols = [
        "price_num",
        "host_is_superhost",
        "host_listings_count",
        "calculated_host_listings_count",
        "calculated_host_listings_count_entire_homes",
        "calculated_host_listings_count_private_rooms",
        "calculated_host_listings_count_shared_rooms",
        "review_scores_rating",
        "review_scores_accuracy",
        "review_scores_cleanliness",
        "review_scores_checkin",
        "review_scores_communication",
        "review_scores_location",
        "review_scores_value",
        "number_of_reviews",
        "reviews_per_month",
        "accommodates",
        "bathrooms",
        "beds",
        "room_type_score",
        "amenities_count",
        "instant_bookable",
        "latitude",
        "longitude",
        "availability_30",
        "availability_60",
        "availability_90",
        "availability_365",
        target,
    ]

    # Keep only the columns that exist, then only those that can be
    # correlated at all (numeric or boolean dtype).
    present = [col for col in candidate_cols if col in df.columns]
    numeric = df[present].select_dtypes(include=["number", "bool"])

    if target not in numeric.columns:
        raise KeyError(
            f"'{target}' must be a numeric column of the dataframe "
            "to compute correlations against it"
        )

    # One column of the correlation matrix: every feature versus the target,
    # ordered from most negative to most positive.
    target_corr = numeric.corr()[target].sort_values()

    print("\n===== Correlation with listing_quality =====")
    print(target_corr)
def _count_amenities(raw):
    """Estimate how many entries a raw ``amenities`` cell lists."""
    text = str(raw).strip()
    # "0" is the placeholder a fillna(0) leaves in an empty cell; "nan", "",
    # "[]" and "{}" likewise describe a cell with no amenities in it.
    if text in {"0", "", "nan", "[]", "{}"}:
        return 0
    # Entries are comma-separated, so N commas means N + 1 entries.
    return text.count(",") + 1


def preprocess_df(df):
    """Derive numeric features and the binary ``listing_quality`` target.

    The frame is modified in place and also returned. Columns written:

    * ``bathrooms``: first number found in ``bathrooms_text`` (0 if none).
    * ``instant_bookable``: 1 where the value is ``"t"``, otherwise 0.
    * ``amenities_count``: number of comma-separated amenity entries.
    * ``price_num``: ``price`` with ``$`` and ``,`` removed, as float.
    * ``room_type_score``: fixed weight per room type, 0 for unknown types.
    * ``host_is_superhost``: 1 for ``"t"``, 0 for anything else.
    * ``listing_quality``: 1 where the weighted score
      ``0.4 * superhost + 0.3 * rating / 100 + 0.3 * (1 - price / max_price)``
      is at or above its median, otherwise 0.

    Args:
        df: Listings frame containing ``bathrooms_text``,
            ``instant_bookable``, ``amenities``, ``price``, ``room_type``,
            ``host_is_superhost`` and ``review_scores_rating``.

    Returns:
        The same DataFrame object, with the columns above added or replaced.

    Raises:
        ValueError: If a ``price`` value cannot be parsed as a float after
            ``$`` and ``,`` are stripped.
    """
    # Leading number of e.g. "2.5 baths"; text without digits becomes 0.
    df["bathrooms"] = (
        df["bathrooms_text"]
        .astype(str)
        .str.extract(r"(\d+\.?\d*)", expand=False)
        .astype(float)
        .fillna(0)
    )

    df["instant_bookable"] = df["instant_bookable"].eq("t").astype(int)
    df["amenities_count"] = df["amenities"].apply(_count_amenities)

    # Parse the price once; the quality score below reuses this column.
    df["price_num"] = (
        df["price"]
        .astype(str)
        .str.replace("$", "", regex=False)
        .str.replace(",", "", regex=False)
        .astype(float)
    )

    room_type_weights = {
        "Entire home/apt": 4.0,
        "Private room": 3.0,
        "Hotel room": 2.5,
        "Shared room": 1.0,
    }
    # Room types outside the table (including a filled-in 0) score 0 rather
    # than NaN, so the feature column never carries missing values.
    df["room_type_score"] = df["room_type"].map(room_type_weights).fillna(0.0)

    # Values the mapping does not know (notably the integer 0 that a
    # fillna(0) leaves behind) would map to NaN and poison the score, so
    # they are treated as "not a superhost".
    df["host_is_superhost"] = (
        df["host_is_superhost"]
        .map({"t": 1, "f": 0, "0": 0})
        .fillna(0)
        .astype(int)
    )

    # NOTE(review): dividing by 100 assumes ratings are on a 0-100 scale;
    # confirm against the dataset.
    rating = pd.to_numeric(df["review_scores_rating"], errors="coerce").fillna(0) / 100

    price = df["price_num"].fillna(0)
    top_price = price.max()
    # Without a positive maximum there is nothing to normalise against; use a
    # ratio of 0 instead of dividing by zero.
    relative_price = price / top_price if top_price > 0 else price * 0.0

    quality = (
        0.4 * df["host_is_superhost"]
        + 0.3 * rating
        + 0.3 * (1 - relative_price)
    )
    # Median split: listings at or above the median score form class 1.
    threshold = quality.median()
    df["listing_quality"] = (quality >= threshold).astype(int)
    return df
def train_models(df):
    """Fit a depth-limited decision tree on listing features and report it.

    Splits the rows 67/33 into train and test sets with a fixed seed, fits
    the classifier on the training part, and prints the accuracy and the
    classification report measured on the held-out part.

    Args:
        df: Preprocessed frame holding the six feature columns named below
            and the ``listing_quality`` label column.
    """
    feature_names = [
        "room_type_score",
        "accommodates",
        "bathrooms",
        "beds",
        "amenities_count",
        "review_scores_location",
    ]
    features = df[feature_names]
    labels = df["listing_quality"]

    # Fixed seed keeps the split reproducible between runs.
    split = train_test_split(features, labels, test_size=0.33, random_state=42)
    train_features, test_features, train_labels, test_labels = split

    print("===== Decision Tree =====")
    # Capping the depth at 5 keeps the tree from overfitting.
    tree = DecisionTreeClassifier(max_depth=5, random_state=42)
    tree.fit(train_features, train_labels)
    predictions = tree.predict(test_features)

    print("Accuracy:", accuracy_score(test_labels, predictions))
    print("Classification Report:\n", classification_report(test_labels, predictions))
if __name__ == "__main__":
    # Pipeline: load the CSV, engineer the features and target, report the
    # correlations with the target, then train and evaluate the classifier.
    listings = preprocess_df(load_df())
    analyze_correlations(listings)
    train_models(listings)