Skip to content

David #2

Merged
merged 2 commits into from
Apr 10, 2026
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 49 additions & 3 deletions hybrid/baseline_script.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from datetime import timedelta


def popularity_split_baseline(behaviors_df, articles_df, train_slice=100000, window_size=10000):
def popularity_split_baseline(behaviors_df, articles_df, train_slice=100_000, window_size=100_00):
behaviors_df = behaviors_df.sort_values('impression_time').reset_index(drop=True)

train_df = behaviors_df.iloc[:train_slice]
Expand Down Expand Up @@ -37,10 +37,9 @@ def popularity_split_baseline(behaviors_df, articles_df, train_slice=100000, win
def popularity_baseline(behaviors_df, articles_df):
total_hr20 = 0
num_clicks = 0

behaviors_df = behaviors_df.sort_values('impression_time')

batch_size = 10000
batch_size = 10_000
for i in range(0, len(behaviors_df), batch_size):
chunk = behaviors_df.iloc[i : i + batch_size]
current_time = chunk['impression_time'].max()
Expand Down Expand Up @@ -185,3 +184,50 @@ def to_list(x):
evaluate_ever_clicked_hr(baseline_recs, behaviors.iloc[100_000 : ])
check_article_stats(baseline_recs, articles)
evaluate_detailed_hits(baseline_recs, behaviors.iloc[100_000 : ])



def popularity_baseline_hybrid(
    behaviors_df: pd.DataFrame,
    articles_df: pd.DataFrame,
    top_k: int = 20,
) -> dict:
    """
    Popularity baseline over the inview candidate set of each impression.

    Ranks every impression's inview articles by their global
    ``total_pageviews``, recommends the top-``top_k`` of them, and reports
    HR@K against the clicked articles.

    Articles with null pageviews are treated as 0 views and therefore rank
    last; ties keep their original inview order (the sort is stable, not
    random).

    Parameters
    ----------
    behaviors_df : pd.DataFrame
        Must contain ``article_ids_inview`` and ``article_ids_clicked``
        columns, each holding a sequence of article ids per row.
    articles_df : pd.DataFrame
        Must provide ``total_pageviews`` per article, keyed either by an
        ``article_id`` column or by the index.
    top_k : int, default 20
        Size of the recommendation list.

    Returns
    -------
    dict
        ``{"hr_at_k", "hits", "impressions", "top_k"}``.
    """
    if behaviors_df.empty:
        return {"hr_at_k": 0.0, "hits": 0, "impressions": 0, "top_k": top_k}

    # Key the pageview lookup by article id. Some call sites pass articles_df
    # with `article_id` as a plain column rather than as the index; keying on
    # the raw positional index there would silently map every lookup to the
    # default 0 and destroy the popularity ranking.
    if "article_id" in articles_df.columns:
        pageviews = articles_df.set_index("article_id")["total_pageviews"]
    else:
        pageviews = articles_df["total_pageviews"]
    # Fill nulls with 0 so unknown-popularity articles rank last.
    pageview_lookup = pageviews.fillna(0).astype(int).to_dict()

    total_hits = 0
    total_impressions = 0

    # Zipping the two columns avoids the per-row overhead of iterrows().
    for inview, clicked in zip(
        behaviors_df["article_ids_inview"], behaviors_df["article_ids_clicked"]
    ):
        # len() is the correct emptiness test and works for both numpy
        # arrays and plain lists (unlike .any(), which also mis-skips
        # rows whose ids are all falsy).
        if len(inview) == 0 or len(clicked) == 0:
            continue

        # Rank inview articles by global popularity (stable sort).
        ranked = sorted(inview, key=lambda aid: pageview_lookup.get(aid, 0), reverse=True)
        recommended = set(ranked[:top_k])  # set for O(1) membership below

        if any(aid in recommended for aid in clicked):
            total_hits += 1
        total_impressions += 1

    hr = total_hits / total_impressions if total_impressions > 0 else 0.0
    print(f"HR@{top_k}: {hr:.4%} ({total_hits}/{total_impressions} impressions)")
    return {"hr_at_k": hr, "hits": total_hits, "impressions": total_impressions, "top_k": top_k}
6 changes: 3 additions & 3 deletions hybrid/hybrid.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -17,15 +17,15 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": null,
"id": "0051c91c",
"metadata": {},
"outputs": [],
"source": [
"def read_data():\n",
" basedir = \"../data/raw/train/\"\n",
" basedir = \"../data/raw/\"\n",
" print(\"Loading data...\")\n",
" behaviors = pd.read_parquet(basedir+'behaviors.parquet')\n",
" behaviors = pd.read_parquet(basedir+'train/behaviors.parquet')\n",
" articles = pd.read_parquet(basedir+'articles.parquet').set_index('article_id')\n",
" behaviors['impression_time'] = pd.to_datetime(behaviors['impression_time'])\n",
" articles['published_time'] = pd.to_datetime(articles['published_time'])\n",
Expand Down
24 changes: 24 additions & 0 deletions hybrid/hybrid.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
import polars as pl
import pandas as pd


from baseline_script import popularity_baseline_hybrid
from content import *
from collaborative import *


# Read the behaviors and articles data
def read_data_hybrid(start=None, length=None) -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    Load a window of the behaviors table plus the full articles table.

    Parameters
    ----------
    start : int
        Row offset of the first behaviors row to load.
    length : int
        Number of behaviors rows to load.

    Returns
    -------
    tuple[pd.DataFrame, pd.DataFrame]
        ``(behaviors, articles)`` as pandas DataFrames.

    Raises
    ------
    ValueError
        If either ``start`` or ``length`` is omitted.
    """
    # Validate arguments before doing any work (and before printing).
    if start is None or length is None:
        raise ValueError("read_data_hybrid requires both 'start' and 'length'")
    basedir = "./data/raw/"
    print("Loading data...")
    # scan_parquet + slice only materializes the requested window instead of
    # reading the whole behaviors file into memory.
    behaviors = pl.scan_parquet(basedir + 'train/behaviors.parquet').slice(start, length).collect().to_pandas()
    articles = pl.read_parquet(basedir + 'articles.parquet').to_pandas()
    return behaviors, articles


if __name__ == "__main__":
    # Load a 1M-impression window starting 2M rows into the behaviors log.
    behaviors, articles = read_data_hybrid(start=2_000_000, length=1_000_000)
    # Score the popularity baseline on that window (HR@5).
    popularity_baseline_hybrid(behaviors, articles, top_k=5)
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -15,4 +15,5 @@ dependencies = [
"stop-words>=2025.11.4",
"jupyter>=1.1.1",
"ipykernel>=6.29.0",
"tqdm>=4.66.0",
]
171 changes: 0 additions & 171 deletions scripts/david/cf.ipynb

This file was deleted.

Loading