Day 7

python
defaultdict
pandas
Author

Josef Fruehwald

Published

December 7, 2022

Part 1

I think I’ll use python, specifically collections.defaultdict().

from path_dict import PathDict
from collections import defaultdict
import re
import pandas as pd
def default0():
  return 0
filesystem = PathDict({})
dir_sum = defaultdict(default0)
with open("2022-12-7_assets/input.txt", 'r') as f:
  history = f.readlines()
history = [line.strip().split() for line in history]
def path_update(path, x):
  if x == "/":
    path = ["/"]
  elif x == "..":
    _ = path.pop(-1)
  else:
    path += [x]
  return(path)

I think I can skip over trying to build out the directory structure in the PathDict … but will I regret it in part 2

path_state = ["/"]
scanned_files = []

for line in history:
  if line[0] == "$":
    if line[1] == "cd":
      path_state = path_update(path_state, line[2])
  else:
    if re.search(r'\d', line[0]):
      # this point messed me up at first! 
      # There are (obviously?) not unique individual filenames!
      file_name = "/".join(path_state) + f"/line[1]"
      if not file_name in scanned_files:
        # *and* there's not unique path names!
        for idx in range(len(path_state)):
          path_name = "/".join(path_state[0:idx+1])
          dir_sum[path_name] += int(line[0])
        scanned_files.append(path_state+[line[1]])
dir_df = pd.DataFrame(dir_sum, index = [1])

I kind of badly want to pivot over to R via reticulate, but let’s try pandas real quick.

long_dir = pd.melt(dir_df)
long_dir[long_dir["value"] <= 100_000]["value"].sum()
1390824
long_dir
                       variable     value
0                             /  46728267
1                         //cvt  32283214
2                //cvt/bbgsthsd    455345
3                //cvt/chhdjtlw   1351375
4       //cvt/chhdjtlw/mbdrgfzs    164331
..                          ...       ...
164            //zft/zrtm/djfww     30770
165            //zft/zrtm/vhdhn   1481975
166      //zft/zrtm/vhdhn/djfww    961278
167  //zft/zrtm/vhdhn/djfww/zft    100941
168        //zft/zrtm/vhdhn/gqc    156273

[169 rows x 2 columns]

Part 2

total_space = 70_000_000
needed_space = 30_000_000
available_space = total_space - dir_sum["/"]
min_size = needed_space - available_space
long_dir["diff_from_needed"] = long_dir["value"] - min_size
(long_dir[
  long_dir["diff_from_needed"] > 0
  ]
  .sort_values(by = "diff_from_needed")
  .iloc[0]
  ["value"]
)
7490863

Just for fun

library(tidyverse)
── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
✔ ggplot2 3.4.0      ✔ purrr   0.3.5 
✔ tibble  3.1.8      ✔ dplyr   1.0.10
✔ tidyr   1.2.1      ✔ stringr 1.4.1 
✔ readr   2.1.3      ✔ forcats 0.5.2 
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
library(reticulate)
library(ggdark)
library(khroma)
library(showtext)
Loading required package: sysfonts
Loading required package: showtextdb
library(scales)

Attaching package: 'scales'

The following object is masked from 'package:purrr':

    discard

The following object is masked from 'package:readr':

    col_factor
library(ggbeeswarm)
library(emojifont)

font_add_google(name = "Mountains of Christmas", family = "christmas")
font_add(family = "Noto Emoji", regular = file.path(font_paths()[2], "NotoEmoji-VariableFont_wght.ttf"))

showtext_auto()

theme_set(dark_theme_gray() + 
            theme(title = element_text(family = "christmas", size = 20)))
Inverted geom defaults of fill and color/colour.
To change them back, use invert_geom_defaults().
long_dir = py$dir_df
py$long_dir |>
  mutate(depth = str_count(variable, "/") - 1) |>
  ggplot(aes(factor(depth), value, color = value))+
    #geom_quasirandom(varwidth = TRUE, width = 0.2)+
    geom_beeswarm()+
    stat_summary(fun = median, 
                 geom = "text", 
                 color = "white",
                 label = emoji("floppy_disk"),
                 family = "Noto Emoji",
                 size = 6)+
    scale_color_hawaii(trans = "log10", guide = "none")+
    labs(x = "embedding depth",
         y = "directory size (log scale)",
         title = "Embedding vs Size")+
    scale_y_log10(labels = label_comma())+
    annotation_logticks(color = "white", sides = 'l')

Figure 1: Directory embedding depth and total size