from path_dict import PathDict
from collections import defaultdict
import re
import pandas as pd
Day 7
python
defaultdict
pandas
Part 1
I think I’ll use python, specifically collections.defaultdict()
.
def default0():
return 0
= PathDict({})
filesystem = defaultdict(default0) dir_sum
with open("2022-12-7_assets/input.txt", 'r') as f:
= f.readlines()
history = [line.strip().split() for line in history] history
def path_update(path, x):
if x == "/":
= ["/"]
path elif x == "..":
= path.pop(-1)
_ else:
+= [x]
path return(path)
I think I can skip over trying to build out the directory structure in the PathDict
… but will I regret it in part 2
= ["/"]
path_state = []
scanned_files
for line in history:
if line[0] == "$":
if line[1] == "cd":
= path_update(path_state, line[2])
path_state else:
if re.search(r'\d', line[0]):
# this point messed me up at first!
# There are (obviously?) not unique individual filenames!
= "/".join(path_state) + f"/line[1]"
file_name if not file_name in scanned_files:
# *and* there's not unique path names!
for idx in range(len(path_state)):
= "/".join(path_state[0:idx+1])
path_name += int(line[0])
dir_sum[path_name] +[line[1]]) scanned_files.append(path_state
= pd.DataFrame(dir_sum, index = [1]) dir_df
I kind of badly want to pivot over to R via reticulate, but let’s try pandas real quick.
= pd.melt(dir_df)
long_dir "value"] <= 100_000]["value"].sum() long_dir[long_dir[
1390824
long_dir
variable value
0 / 46728267
1 //cvt 32283214
2 //cvt/bbgsthsd 455345
3 //cvt/chhdjtlw 1351375
4 //cvt/chhdjtlw/mbdrgfzs 164331
.. ... ...
164 //zft/zrtm/djfww 30770
165 //zft/zrtm/vhdhn 1481975
166 //zft/zrtm/vhdhn/djfww 961278
167 //zft/zrtm/vhdhn/djfww/zft 100941
168 //zft/zrtm/vhdhn/gqc 156273
[169 rows x 2 columns]
Part 2
= 70_000_000
total_space = 30_000_000
needed_space = total_space - dir_sum["/"]
available_space = needed_space - available_space min_size
"diff_from_needed"] = long_dir["value"] - min_size
long_dir[
(long_dir["diff_from_needed"] > 0
long_dir[
]= "diff_from_needed")
.sort_values(by 0]
.iloc["value"]
[ )
7490863
Just for fun
library(tidyverse)
── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
✔ ggplot2 3.4.0 ✔ purrr 0.3.5
✔ tibble 3.1.8 ✔ dplyr 1.0.10
✔ tidyr 1.2.1 ✔ stringr 1.4.1
✔ readr 2.1.3 ✔ forcats 0.5.2
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
library(reticulate)
library(ggdark)
library(khroma)
library(showtext)
Loading required package: sysfonts
Loading required package: showtextdb
library(scales)
Attaching package: 'scales'
The following object is masked from 'package:purrr':
discard
The following object is masked from 'package:readr':
col_factor
library(ggbeeswarm)
library(emojifont)
font_add_google(name = "Mountains of Christmas", family = "christmas")
font_add(family = "Noto Emoji", regular = file.path(font_paths()[2], "NotoEmoji-VariableFont_wght.ttf"))
showtext_auto()
theme_set(dark_theme_gray() +
theme(title = element_text(family = "christmas", size = 20)))
Inverted geom defaults of fill and color/colour.
To change them back, use invert_geom_defaults().
= py$dir_df long_dir
$long_dir |>
pymutate(depth = str_count(variable, "/") - 1) |>
ggplot(aes(factor(depth), value, color = value))+
#geom_quasirandom(varwidth = TRUE, width = 0.2)+
geom_beeswarm()+
stat_summary(fun = median,
geom = "text",
color = "white",
label = emoji("floppy_disk"),
family = "Noto Emoji",
size = 6)+
scale_color_hawaii(trans = "log10", guide = "none")+
labs(x = "embedding depth",
y = "directory size (log scale)",
title = "Embedding vs Size")+
scale_y_log10(labels = label_comma())+
annotation_logticks(color = "white", sides = 'l')