adding aria to long or wide tables

sometimes tables are large and truncated by default to save screen and disc real estate. when this happens, we break the meaning of the columns/row ordering to assistive technology. we need to supplement the table elements with aria to ensure an accessible experience.

we'll also highlight a persistent ambiguity between visual dataframe indexing and audible dataframe indexing.

    import pandas, bs4, enum, numpy, midgy, functools
    # register ``str`` as the text/html formatter for BeautifulSoup objects so that
    # displaying a soup in the notebook shows its markup as html.
    get_ipython().display_formatter.formatters["text/html"].for_type(bs4.BeautifulSoup, str);

/tmp/ipykernel_18771/1227099998.py:1: DeprecationWarning: 
Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466

  import pandas, bs4, enum, numpy, midgy, functools

%%
<style>
:is(.jp-OutputArea-output.jp-RenderedHTMLCommon, .nb-outputs) :is(td,th) {
    border: 1px solid;
}
</style>

get an aria marked up, non-uniform table.

    def get_table(df, ARIA=True, caption=None):
        """build an aria marked up html table for a dataframe.

        the frame is considered WIDE/LONG when its shape plus one exceeds
        ``pandas.options.display.max_columns``/``max_rows``; the visible ranges
        then come from ``get_ranges`` and the remainder is represented by
        placeholder cells.

        parameters:
            df: the dataframe to render.
            ARIA: when truthy, aria row/column counts and indexes are written
                even if the table is not truncated.
            caption: optional text placed in the ``caption`` element before the
                summary produced by ``get_caption``.

        returns:
            a ``bs4.BeautifulSoup`` object containing the ``table`` element.
        """
        soup = bs4.BeautifulSoup(features="lxml")
        max_columns = pandas.options.display.max_columns
        max_rows = pandas.options.display.max_rows
        # a limit of None cannot be compared with an int and a limit of 0 would leave
        # nothing visible, so both are treated as "do not truncate".
        WIDE = bool(max_columns) and (df.shape[1] + 1) > max_columns
        LONG = bool(max_rows) and (df.shape[0] + 1) > max_rows
        col_ranges, row_ranges = get_ranges(df, WIDE, LONG)
        table = new(
            "table",
            colcount=row_major_at_cols(df) if ARIA or WIDE else None,
            rowcount=row_major_at_rows(df) if ARIA or LONG else None,
        )
        soup.append(table)
        cap = new("caption", caption)
        table.append(cap)
        cap.append(get_caption(df))
        get_thead(df, table, col_ranges, WIDE, ARIA, LONG)
        get_tbody(df, table, col_ranges, row_ranges, WIDE, ARIA, LONG)
        return soup

    def get_thead(df, table, col_ranges, WIDE=False, ARIA=False, LONG=False):
        """append one heading row per column level to ``table``.

        parameters:
            df: the dataframe being rendered.
            table: the ``table`` tag that receives the ``tr`` rows.
            col_ranges: pair of ranges (leading, trailing) of visible column positions.
            WIDE: whether columns are truncated; trailing headings then always get a colindex.
            ARIA: when truthy, aria-rowindex/aria-colindex are written on every heading.
            LONG: accepted for a signature parallel to ``get_tbody``; heading rows
                always come before any hidden rows so it does not affect them.

        NOTE(review): a "hidden" placeholder heading is appended before the trailing
        range even when the gap between the ranges is zero; ``get_tbody`` does the
        same, so the two stay aligned.
        """
        ROWS, COLS = any(df.index.names), any(df.columns.names)
        index_levels, column_levels = df.index.nlevels, df.columns.nlevels
        # number of columns covered by the placeholder between the two visible ranges
        col_center = col_ranges[1].start - col_ranges[0].stop
        # columns that precede the first data column: the index headings plus the
        # extra column used when both axes carry names.
        offset = index_levels + int(ROWS and COLS)
        for col_level, col_name in enumerate(df.columns.names):
            # heading rows precede every body row, so only ARIA decides the rowindex
            table.append(tr := trow(rowindex=col_level + 1 if ARIA else None))
            if not col_level and (ROWS or not COLS):
                for row_level, row_name in enumerate(df.index.names):
                    # an unnamed level is None; str(None) is truthy, so test it explicitly
                    label = "" if row_name is None else str(row_name)
                    tr.append(theading(
                        label or F"index {row_level}", scope="col",
                        rowspan=column_levels if column_levels > 1 else None,
                        colindex=row_level + 1 if ARIA else None))
            if COLS:
                label = "" if col_name is None else str(col_name)
                tr.append(theading(
                    label or F"level {col_level}", scope="row",
                    colindex=index_levels + 1 if ARIA else None))

            level_values = df.columns.get_level_values(col_level)
            for col_part, col_range in enumerate(col_ranges):
                if col_part:
                    # the placeholder sits directly after the last leading column
                    tr.append(theading(
                        HIDDEN,
                        colindex=offset + col_ranges[0].stop + 1 if ARIA else None,
                        **{"aria-colspan": col_center}))
                for col_index in col_range:
                    tr.append(theading(
                        str(level_values[col_index]), scope="col",
                        colindex=offset + col_index + 1 if ARIA or WIDE and col_part else None))

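one of the confusions that will be encountered is that the natural row and column indexing of tidy data frames will be inconsistent with the indexes announced by assistive technology. assistive technology indexes tables starting from 1 and counts the heading rows and index columns, so the announced row and column counts are larger than the data frame's shape.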

    def row_major_at_rows(df):
        """total number of table rows: one per column level plus one per data row."""
        heading_rows = df.columns.nlevels
        data_rows = df.shape[0]
        return heading_rows + data_rows
    def row_major_at_cols(df):
        """total number of table columns: index levels, one extra column when any column level is named, plus the data columns."""
        names_column = 1 if any(df.columns.names) else 0
        return df.index.nlevels + names_column + df.shape[1]

we present a potential solution: provide more information in the caption that may alert screen reader users to the mismatch.

    def get_caption(df):
        """describe the frame as a description list: row and column counts, then a nested list with the number of index levels on each axis."""
        def describe(parent, term, detail):
            # every entry is a dt/dd pair appended in order
            parent.append(new("dt", term))
            parent.append(new("dd", detail))

        summary = new("dl", role="presentation")
        describe(summary, "rows", str(len(df)))
        describe(summary, "columns", str(len(df.columns)))
        levels = new("dl", role="presentation")
        describe(summary, "indexes:", levels)
        describe(levels, "rows", str(df.index.nlevels))
        describe(levels, "columns", str(df.columns.nlevels))
        return summary

iterate through the rows of data and append them to the table

    def get_tbody(df, table, col_ranges, row_ranges, WIDE=False, ARIA=False, LONG=False):
        """append the body rows of ``df`` to ``table``.

        parameters:
            df: the dataframe being rendered.
            table: the ``table`` tag that receives the ``tr`` rows.
            col_ranges: pair of ranges (leading, trailing) of visible column positions.
            row_ranges: pair of ranges (leading, trailing) of visible row positions.
            WIDE, LONG: accepted for a signature parallel to ``get_thead``; not read here.
            ARIA: when truthy, the row headings carry an aria-colindex.

        before the trailing row range a single placeholder row is written whose
        aria-rowspan is the number of rows between the two ranges; inside each row a
        placeholder cell with the matching aria-colspan is written before the
        trailing column range. positions are computed from the range bounds rather
        than from loop variables so an empty leading range does not fail.
        """
        ROWS, COLS = any(df.index.names), any(df.columns.names)
        index_levels, column_levels = df.index.nlevels, df.columns.nlevels
        row_center = row_ranges[1].start - row_ranges[0].stop
        col_center = col_ranges[1].start - col_ranges[0].stop
        # columns that precede the first data column
        offset = index_levels + int(ROWS and COLS)
        # aria positions of the placeholders: directly after the last leading row/column
        hidden_row = column_levels + row_ranges[0].stop + 1
        hidden_col = offset + col_ranges[0].stop + 1
        # the filler cell under the column-names heading follows the index headings
        filler_col = index_levels + 1
        index_values = [df.index.get_level_values(level) for level in range(index_levels)]
        for row_part, row_range in enumerate(row_ranges):
            if row_part:
                table.append(tr := trow(rowindex=hidden_row, **{"aria-rowspan": row_center}))
                for row_level in range(index_levels):
                    tr.append(theading(HIDDEN, colindex=row_level + 1))
                if ROWS and COLS:
                    tr.append(tdata(EMPTY, colindex=filler_col))
                for col_part, col_range in enumerate(col_ranges):
                    if col_part:
                        tr.append(tdata(
                            HIDDEN, colindex=hidden_col,
                            **{"aria-rowspan": row_center, "aria-colspan": col_center}))
                    for col_index in col_range:
                        tr.append(tdata(HIDDEN, colindex=offset + col_index + 1))
            for row_index in row_range:
                table.append(tr := trow(rowindex=row_index + 1 + column_levels))
                for row_level in range(index_levels):
                    tr.append(theading(
                        str(index_values[row_level][row_index]),
                        colindex=row_level + 1 if ARIA else None, scope="row"))
                if ROWS and COLS:
                    tr.append(tdata(EMPTY, colindex=filler_col))
                for col_part, col_range in enumerate(col_ranges):
                    if col_part:
                        tr.append(tdata(HIDDEN, colindex=hidden_col, **{"aria-colspan": col_center}))
                    for col_index in col_range:
                        tr.append(tdata(str(df.iloc[row_index, col_index]), colindex=offset + col_index + 1))

pandas.options.display.max_columns, pandas.options.display.max_rows determine the ranges of values that are presented to the visitor. this work implements special logic for spanning multiple rows and columns for truncated data.

    def get_frame_bounds(df, WIDE=False, LONG=False):
        """return (leading column stop, trailing column start, leading row stop, trailing row start).

        without truncation both bounds of an axis equal its length; with WIDE/LONG
        the leading part is half of the pandas display limit and the trailing part
        starts that many positions from the end.
        """
        n_rows, n_cols = df.shape
        col_head = col_tail = n_cols
        row_head = row_tail = n_rows
        if WIDE:
            col_head = pandas.options.display.max_columns // 2
            col_tail = n_cols - col_head
        if LONG:
            row_head = pandas.options.display.max_rows // 2
            row_tail = n_rows - row_head
        return col_head, col_tail, row_head, row_tail
    def get_ranges(df, WIDE=False, LONG=False):
        """return ((leading columns, trailing columns), (leading rows, trailing rows)) as ranges of positions to show."""
        col_head, col_tail, row_head, row_tail = get_frame_bounds(df, WIDE=WIDE, LONG=LONG)
        n_rows, n_cols = df.shape
        col_ranges = (range(col_head), range(col_tail, n_cols))
        row_ranges = (range(row_head), range(row_tail, n_rows))
        return col_ranges, row_ranges

to provide the proper mark up for large tables we need to be rigorous about use of ARIA: rowindex colindex rowspan colspan.

    def new(tag,
            string=None, rowindex=None, colindex=None, rowcount=None, colcount=None, rowspan=None, colspan=None, scope=None,
            *, soup=bs4.BeautifulSoup(features="lxml"), **attrs):
        """create a new tag, mapping the table keywords to html/aria attributes.

        truthy ``rowindex``/``colindex``/``rowcount``/``colcount`` become ``aria-*``
        attributes, truthy ``rowspan``/``colspan``/``scope`` are set as given, any
        other keyword is passed through as an attribute, and a truthy ``string`` is
        appended as the tag's content.
        """
        aria_values = {"rowindex": rowindex, "colindex": colindex, "rowcount": rowcount, "colcount": colcount}
        plain_values = {"rowspan": rowspan, "colspan": colspan, "scope": scope}
        # aria attributes are written first, then the plain ones; falsy values are skipped
        for key, value in aria_values.items():
            if value:
                attrs[F"aria-{key}"] = value
        for key, value in plain_values.items():
            if value:
                attrs[key] = value
        element = soup.new_tag(tag, attrs=attrs)
        if string:
            element.append(string)
        return element
    # shorthands for ``new`` with the tag name fixed: table row, heading cell, data cell.
    trow = functools.partial(new, "tr")
    theading = functools.partial(new, "th")
    tdata = functools.partial(new, "td")

sample data

    # a three level index named J, K, L built from the product of the level values
    index = pandas.MultiIndex.from_product([["A", "Z"], ["M", "N", "O"], [1, 2, 3]], names=[*"JKL"])
    # use it on both axes, rename the column levels to 10, 100, 1000 and keep the first rows
    (df := pandas.DataFrame(columns=index, index=index).rename_axis(columns=[10, 100, 1000]).head())
    # a single level variant: drop the two outer levels on each axis and remove the axis names
    single = df.droplevel((0, 1), 0).droplevel((0, 1), 1).rename_axis(None, axis=1).rename_axis(None, axis=0)
    # tile the frame 10 times across and 20 times down to get a large frame
    wide = pandas.concat([pandas.concat([df]*10, axis=1)]*20)
    # show the default pandas rendering with the display limited to 4 rows and 4 columns
    with (options := pandas.option_context("display.max_rows", 4, "display.max_columns", 4)):
        display(wide)

		10	A		...	Z
		100	M		...	O
		1000	1	2	...	2	3
J	K	L
A	M	1	NaN	NaN	...	NaN	NaN
	M	2	NaN	NaN	...	NaN	NaN
	...	...	...	...	...	...	...
	N	1	NaN	NaN	...	NaN	NaN
	N	2	NaN	NaN	...	NaN	NaN

100 rows × 180 columns

%%
<style>
/**inline flex the dl block display and force it cause jupyter is aggressive.**/
table>caption dl {
    display: inline-flex !important;
    justify-content: flex-start;
}
/** add punctuation after each of the descriptors. **/
table>caption dl>dd {
    &::after {content: ", "}; &:last-child::after {content: " "};
}
/**unset some jupyter nonsense**/
dl > dt, dl > dd {
    width: unset !important;
    float: unset !important;
    padding-right: 1rem !important;
}
</style>

HIDDEN and EMPTY are used for visual verification of the technique.

    # visible text for placeholder cells (truncated rows/columns) and filler cells
    HIDDEN, EMPTY = "hidden",  "empty"

    # pass the text by keyword: the second positional parameter of get_table is ARIA, not caption
    with options: display(get_table(wide, caption="a smaller table representation with hidden rows and columns."))

rows
100
columns
180
indexes:
rows
3
columns
3
J	K	L	10	A	A	hidden	Z	Z
			100	M	M	hidden	O	O
			1000	1	2	hidden	2	3
A	M	1	empty	nan	nan	hidden	nan	nan
A	M	2	empty	nan	nan	hidden	nan	nan
hidden	hidden	hidden	empty	hidden	hidden	hidden	hidden	hidden
A	N	1	empty	nan	nan	hidden	nan	nan
A	N	2	empty	nan	nan	hidden	nan	nan

    # the same frame with the display limited to 10 rows and 10 columns, no caption text
    with pandas.option_context("display.max_rows", 10, "display.max_columns", 10): display(get_table(wide))

rows
100
columns
180
indexes:
rows
3
columns
3
J	K	L	10	A	A	A	A	A	hidden	Z	Z	Z	Z	Z
			100	M	M	M	N	N	hidden	N	N	O	O	O
			1000	1	2	3	1	2	hidden	2	3	1	2	3
A	M	1	empty	nan	nan	nan	nan	nan	hidden	nan	nan	nan	nan	nan
A	M	2	empty	nan	nan	nan	nan	nan	hidden	nan	nan	nan	nan	nan
A	M	3	empty	nan	nan	nan	nan	nan	hidden	nan	nan	nan	nan	nan
A	N	1	empty	nan	nan	nan	nan	nan	hidden	nan	nan	nan	nan	nan
A	N	2	empty	nan	nan	nan	nan	nan	hidden	nan	nan	nan	nan	nan
hidden	hidden	hidden	empty	hidden	hidden	hidden	hidden	hidden	hidden	hidden	hidden	hidden	hidden	hidden
A	M	1	empty	nan	nan	nan	nan	nan	hidden	nan	nan	nan	nan	nan
A	M	2	empty	nan	nan	nan	nan	nan	hidden	nan	nan	nan	nan	nan
A	M	3	empty	nan	nan	nan	nan	nan	hidden	nan	nan	nan	nan	nan
A	N	1	empty	nan	nan	nan	nan	nan	hidden	nan	nan	nan	nan	nan
A	N	2	empty	nan	nan	nan	nan	nan	hidden	nan	nan	nan	nan	nan

notes/discussion

an array may be a special case of a table where headings aren't needed. a table with no row or column headings will reveal a nice array.

this example is a most complex axis case and does not include grouping.