Skip to content

speed running a dataframe to table

this is a pure dataframe approach to constructing html representations

  • the shorter the code distance, the easier it is to reason
  • set up your accessors before solving your problem
    • what libraries will you need to use in dataframe chaining?
    • do they improve your ability to reason?
    • encapsulate the complexity
    # `new` builds a standalone tag; it is used at the end to create the table caption
    from nbconvert_a11y.table import new
    # demo data: a 100 x 100 frame of random normal samples
    df = DataFrame(numpy.random.randn(100, 100))
    # number of leading and trailing rows/columns kept in the rendered table
    visible = 3
    # positional ranges selecting the first and the last `visible` columns
    cranges = [Index(range(visible)), Index(range(len(df.columns)-visible, len(df.columns)))]
    # positional ranges selecting the first and the last `visible` rows
    rranges = [Index(range(visible)), Index(range(len(df)-visible, len(df)))]

the table representation establishes the concept of rows and columns along with their associated indexes. our transformation from a dataframe translates the nominal tidy data columns and indexes to ordinal table row and column indexes. we do this in a few groups: 1. the thead region, 2. the tbody region, 3. the tfoot region.

our primary library for building the table is beautiful soup. spending time building out custom accessors for the dataframe objects will prove beneficial when building our final table.

@pandas.api.extensions.register_dataframe_accessor("bs4")
@pandas.api.extensions.register_series_accessor("bs4")
@pandas.api.extensions.register_index_accessor("bs4")
@dataclass
class _BeautifulSoup:
    """``.bs4`` accessor that turns pandas objects into Beautiful Soup tags.

    Registered on ``DataFrame``, ``Series`` and ``Index``. ``object`` is the
    pandas object the accessor is attached to; ``cast`` is an optional
    callable applied to every value before it is placed inside a tag.
    """

    # one soup shared by every accessor instance; it is only used as a tag
    # factory. it carries no annotation, so the dataclass does not treat it
    # as an init field.
    parent = bs4.BeautifulSoup(features="lxml")
    object: DataFrame | Series | Index
    cast: typing.Callable = None
    GROUPS = "thead", "tbody", "table", "tr", "tfoot"

    @classmethod
    def new(cls, name, *object, **attrs):
        """Create a ``name`` tag with ``attrs`` and ``object`` as its children."""
        tag = cls.parent.new_tag(name=name, attrs=attrs)
        tag.extend(object)
        return tag

    def _cast(self, object):
        """Apply the optional ``cast``; keep tags as they are, stringify the rest."""
        value = object if self.cast is None else self.cast(object)
        return value if isinstance(value, bs4.Tag) else str(value)

    def _tabular(self):
        """Return the wrapped object, with an ``Index`` converted to a frame."""
        data = self.object
        return data.to_frame() if isinstance(data, Index) else data

    def element(self, name, **attrs):
        """Wrap every individual value in its own ``name`` tag.

        Returns an object of the same layout (frame or series) whose values
        are tags. Raises ``TypeError`` for unsupported wrapped objects.
        """
        data = self._tabular()
        wrap = partial(self.new, name, **attrs)
        if isinstance(data, DataFrame):
            return data.map(self._cast).map(wrap)
        if isinstance(data, Series):
            return data.apply(self._cast).apply(wrap)
        raise TypeError(F"cant create element for type of {type(self.object)}")

    def elements(self, name, axis=1, **attrs):
        """Group values as the children of ``name`` tags.

        For a frame, one tag is built per row (or per column, depending on
        ``axis``) and the result of ``DataFrame.apply`` is returned. For a
        series, a single tag holding all values is returned. Raises
        ``TypeError`` for unsupported wrapped objects.
        """
        data = self._tabular()
        if isinstance(data, DataFrame):
            def group(values):
                return self.new(name, *values, **attrs)

            return data.map(self._cast).apply(group, axis=axis)
        if isinstance(data, Series):
            children = data.apply(self._cast)
            return self.new(name, *children, **attrs)
        raise TypeError(F"cant create elements for type of {type(self.object)}")

    # generate one shortcut method per html tag name: container tags group
    # their values (``elements``), cell-like tags wrap each value (``element``).
    for _method, _tags in (
        (elements, "table tbody thead tfoot tr dl ul ol"),
        (element, "th td li dt dd"),
    ):
        for _tag in _tags.split():
            locals()[_tag] = functools.partialmethod(_method, _tag)
    # drop the loop variables so they do not become class attributes
    del _method, _tags, _tag
%%
after setting up our beautiful soup jigs we can now rip through a dataframe that consolidates to an html table.

1. initialize the thead with the names of the indexes ~~or columns~~

        colgroups = [DataFrame([[*df.index.names]]).bs4.th(scope="col")]
        for crange in cranges:
1. add the column names to the thead

            colgroups.append(DataFrame(df.columns[crange].to_frame().T.values, None, crange).bs4.th(scope="col"))

1. initialize the rowgroups with the thead computed

        rowgroups = [pandas.concat(colgroups, axis=1).bs4.tr().bs4.thead()]
        for rrange in rranges:
1. iterate through the row ranges initializing the column groups with the index names

            colgroups = [DataFrame(df.index[rrange].values, rrange + df.columns.nlevels).bs4.th(scope="row")]
            for crange in cranges:
1. map the dataframe to the ordinal row, group of a table element

                colgroups.append(DataFrame(
                    df.iloc[rrange, crange].values, rrange + df.columns.nlevels, crange + df.index.nlevels
                ).bs4.td())
            rowgroups.append(pandas.concat(colgroups, axis=1).bs4.tr().bs4.tbody())

1. aggregate the statistics of the footer

        colgroups = []
        index = len(df) + df.columns.nlevels + pandas.RangeIndex(2)
        for crange in cranges:
1. the min and max, the extent, enable us to do lots of cool stuff with plotting

            stats = df.iloc[:, crange].agg("min max".split())
            if not colgroups:
1. write the summary index names

                colgroups.append(DataFrame(stats.index.values, index).bs4.th(scope="row"))
1. write the summary values

            colgroups.append(DataFrame(stats.values, index, df.index.nlevels + crange).bs4.td())
1. consolidate the footer

        rowgroups.append(pandas.concat(colgroups, axis=1).bs4.tr().bs4.tfoot())        
1. post process and synthesize the complete table

        table = Series(rowgroups).bs4.table(id="demo")
        table.insert(0, new("caption", "quickest table draw in the pacific northwest"))

{{table}}

after setting up our beautiful soup jigs we can now rip through a dataframe that consolidates to an html table.

  1. initialize the thead with the names of the indexes or columns

     colgroups = [DataFrame([[*df.index.names]]).bs4.th(scope="col")]
     for crange in cranges:
    
  2. add the column names to the thead

         colgroups.append(DataFrame(df.columns[crange].to_frame().T.values, None, crange).bs4.th(scope="col"))
    
  3. initialize the rowgroups with the thead computed

     rowgroups = [pandas.concat(colgroups, axis=1).bs4.tr().bs4.thead()]
     for rrange in rranges:
    
  4. iterate through the row ranges initializing the column groups with the index names

         colgroups = [DataFrame(df.index[rrange].values, rrange + df.columns.nlevels).bs4.th(scope="row")]
         for crange in cranges:
    
  5. map the dataframe to the ordinal row, group of a table element

             colgroups.append(DataFrame(
                 df.iloc[rrange, crange].values, rrange + df.columns.nlevels, crange + df.index.nlevels
             ).bs4.td())
         rowgroups.append(pandas.concat(colgroups, axis=1).bs4.tr().bs4.tbody())
    
  6. aggregate the statistics of the footer

     colgroups = []
     index = len(df) + df.columns.nlevels + pandas.RangeIndex(2)
     for crange in cranges:
    
  7. the min and max, the extent, enable us to do lots of cool stuff with plotting

         stats = df.iloc[:, crange].agg("min max".split())
         if not colgroups:
    
  8. write the summary index names

             colgroups.append(DataFrame(stats.index.values, index).bs4.th(scope="row"))
    
  9. write the summary values

         colgroups.append(DataFrame(stats.values, index, df.index.nlevels + crange).bs4.td())
    
  10. consolidate the footer

     rowgroups.append(pandas.concat(colgroups, axis=1).bs4.tr().bs4.tfoot())        
    
  11. post process and synthesize the complete table

     table = Series(rowgroups).bs4.table(id="demo")
     table.insert(0, new("caption", "quickest table draw in the pacific northwest"))
    
quickest table draw in the pacific northwest
None012979899
02.2504039209751947-2.2345675848030893-0.61042815601272840.60200688783276151.203908035997032-1.7136577818868608
1-1.5662144896915853-0.4362712027341796-1.1289212836376208-0.88571417817867180.007626107856577465-0.4528001992314824
2-1.61089686953545150.6006798243860567-2.1136251861768622-0.26796646005791991.48077571590237841.1963786319147491
97-0.42059916638771140.38761955592653630.8932022146952874-0.965072282031295-0.32480589021166145-0.31470531603110863
98-0.626345645214089-0.73284308241462661.08497522265435940.51961054179592290.4262418297441081-0.34981111139307625
99-0.383996023977735250.8058392220376006-0.159729141776956740.12116047467294262-1.3871299199063132-1.7465081936039943
min-1.8154838350026143-3.396175726528162-3.902040070737226-2.6311202211632185-2.6190849260835383-2.122234340844054
max2.64483536168898332.388646608604362.1916883214233652.78753397105395262.2404090486536512.0519610479207073