diff --git a/Papers/Paper/apa7.csl b/Papers/Paper/apa7.csl
new file mode 100644
index 0000000..09f4635
--- /dev/null
+++ b/Papers/Paper/apa7.csl
@@ -0,0 +1,1900 @@
+
+
diff --git a/Papers/Paper/cgloss.sty b/Papers/Paper/cgloss.sty
new file mode 100644
index 0000000..d2cbfb8
--- /dev/null
+++ b/Papers/Paper/cgloss.sty
@@ -0,0 +1,204 @@
+% -*- LaTeX -*-
+
+% Modified version of cgloss4e.sty. Hacked and renamed cgloss.sty
+% by Alexis Dimitriadis (alexis@babel.ling.upenn.edu)
+
+% Following borrows from Covington's style files inspired by Midnight by M.
+% de Groot, adapted to be used with gb4e.sty: examples beginning with \ex can
+% contain glosses directly. Default is
+% Linguistic Inquiry style with all lines in \rm; to change a line (e.g., to
+% \it for a particular journal), change the appropriate line: e.g.,
+% \let\eachwordone=\rm in a copy of this file. Note that it will NOT work
+% to put \it before the line as the words are parsed separately.
+
+% Use \singlegloss to force single-spaced glosses even in double-space
+% environments. Works also in footnotes (^M as delimiter replaced by
+% \\)---hpk
+%
+
+% Changes by Alexis Dimitriadis
+%
+% Removed flushleft environment and initial vskip to make macros usable
+% on the same line with earlier and/or later text, e.g.,
+% \item Q: \gll ... \\
+% ... \\ \hfill (Greek)
+% \trans Q: `...'
+% Note: Text following the gloss will appear on line one of the example;
+% To get a line break, insert one manually with \\ or use \trans or \glt.
+% (\gln does not end the line, since it was already the
+% case in cgloss4e that a translation must be ended with a line break).
+%
+% Modified \glt to keep translation on the same page as the text.
+
+% BUGS: does not work very gracefully with double spacing (struts are not
+% automatically inserted after the \vboxes used by the macros).
+% In double space mode without \nosinglegloss, when a translation
+% is not given, the next line of text is single-spaced from the glossed text.
+% With \nosinglegloss, the translation is set too close to the bottom line.
+
+
+%%%
+%%% Sentences with word-by-word glosses
+%%%
+
+% See covingtn.tex for full documentation. Some examples:
+%
+% Displayed sentence with gloss and translation:
+%
+% \gll Dit is een Nederlands voorbeeld.\\
+% This is a Dutch example.\\
+% \glt `This is an example in Dutch.'
+%
+% Same, using bracketing where words do not correspond one-to-one:
+%
+% \gll Dit is een voorbeeldje in het Nederlands.\\
+% This is a {little example} in {} Dutch.\\
+% \glt `This is a little example in Dutch.'
+%
+% If you want to align 3 lines rather than two, use \glll instead of \gll.
+%
+% Layout is critical between \gll (or \glll) and \glt (or \gln).
+%
+% Thanks to Marcel R. van der Goot for permission to reproduce code.
+\let\@gsingle=1
+\def\singlegloss{\let\@gsingle=1}
+\def\nosinglegloss{\let\@gsingle=0}
+\@ifundefined{new@fontshape}%
+ {\def\@selfnt{\ifx\@currsize\normalsize\@normalsize\else\@currsize\fi}}
+ {\def\@selfnt{\selectfont}}
+
+\def\gll% % Introduces 2-line text-and-gloss.
+ {\bgroup %\begin{flushleft}
+ \ifx\@gsingle1% conditionally force single spacing (hpk/MC)
+ \def\baselinestretch{1}\@selfnt\fi
+% \vskip\baselineskip\def\baselinestretch{1}%
+% \@selfnt\vskip-\baselineskip\fi%
+ \bgroup
+ \twosent
+ }
+
+\def\glll% % Introduces 3-line text-and-gloss.
+ {\bgroup %\begin{flushleft}
+ \ifx\@gsingle1% conditionally force single spacing (hpk/MC)
+ \def\baselinestretch{1}\@selfnt\fi
+% \vskip\baselineskip\def\baselinestretch{1}%
+% \@selfnt\vskip-\baselineskip\fi%
+ \bgroup
+ \threesent
+ }
+
+% \def\glt{\vskip.0\baselineskip}
+
+% redefine \gltoffset to set off translation from ex and gloss
+\@ifundefined{gltoffset}{\def\gltoffset{0pt}}{}
+
+\def\glt{\ifhmode\\*[\gltoffset]\else\nobreak\vskip\gltoffset\nobreak\fi}
+
+
+% Introduces a translation
+\let\trans\glt
+
+\def\gln{\relax}
+\def\glend{} % obsolete
+ % Ends the gloss environment.
+
+% The following TeX code is adapted, with permission, from:
+% gloss.tex: Macros for vertically aligning words in consecutive sentences.
+% Version: 1.0 release: 26 November 1990
+% Copyright (c) 1991 Marcel R. van der Goot (marcel@cs.caltech.edu).
+% Original Midnight/gloss.tex and Midnight/gloss.doc are available from
+% csvax.cs.caltech.edu [131.215.131.131] in pub/tex
+% and many other anonymous ftp archives.
+
+\newbox\lineone % boxes with words from first line
+\newbox\linetwo
+\newbox\linethree
+\newbox\wordone % a word from the first line (hbox)
+\newbox\wordtwo
+\newbox\wordthree
+\newbox\gline % the constructed double line (hbox)
+\newskip\glossglue % extra glue between glossed pairs or triples
+\glossglue = 0pt plus 2pt minus 1pt % allow stretch/shrink between words
+%\glossglue = 5pt plus 2pt minus 1pt % allow stretch/shrink between words
+\newif\ifnotdone
+
+\@ifundefined{eachwordone}{\let\eachwordone=\rm}{\relax}
+\@ifundefined{eachwordtwo}{\let\eachwordtwo=\rm}{\relax}
+\@ifundefined{eachwordthree}{\let\eachwordthree=\rm}{\relax}
+
+\def\lastword#1#2#3% #1 = \each, #2 = line box, #3 = word box
+ {\setbox#2=\vbox{\unvbox#2%
+ \global\setbox#3=\lastbox
+ }%
+ \ifvoid#3\global\setbox#3=\hbox{#1\strut{} }\fi
+ % extra space following \strut in case #1 needs a space
+ }
+
+\def\testdone
+ {\ifdim\ht\lineone=0pt
+ \ifdim\ht\linetwo=0pt \notdonefalse % tricky space after pt
+ \else\notdonetrue
+ \fi
+ \else\notdonetrue
+ \fi
+ }
+
+\gdef\getwords(#1,#2)#3 #4\\% #1=linebox, #2=\each, #3=1st word, #4=remainder
+ {\setbox#1=\vbox{\hbox{#2\strut#3 }% adds space
+ \unvbox#1%
+ }%
+ \def\more{#4}%
+ \ifx\more\empty\let\more=\donewords
+ \else\let\more=\getwords
+ \fi
+ \more(#1,#2)#4\\%
+ }
+
+\gdef\donewords(#1,#2)\\{}%
+
+\gdef\twosent#1\\ #2\\{% #1 = first line, #2 = second line
+ \getwords(\lineone,\eachwordone)#1 \\%
+ \getwords(\linetwo,\eachwordtwo)#2 \\%
+ \loop\lastword{\eachwordone}{\lineone}{\wordone}%
+ \lastword{\eachwordtwo}{\linetwo}{\wordtwo}%
+ \global\setbox\gline=\hbox{\unhbox\gline
+ \hskip\glossglue
+ \vtop{\box\wordone % vtop was vbox
+ \nointerlineskip
+ \box\wordtwo
+ }%
+ }%
+ \testdone
+ \ifnotdone
+ \repeat
+ \egroup % matches \bgroup in \gloss
+ \gl@stop}
+
+\gdef\threesent#1\\ #2\\ #3\\{% #1 = first line, #2 = second line, #3 = third
+ \getwords(\lineone,\eachwordone)#1 \\%
+ \getwords(\linetwo,\eachwordtwo)#2 \\%
+ \getwords(\linethree,\eachwordthree)#3 \\%
+ \loop\lastword{\eachwordone}{\lineone}{\wordone}%
+ \lastword{\eachwordtwo}{\linetwo}{\wordtwo}%
+ \lastword{\eachwordthree}{\linethree}{\wordthree}%
+ \global\setbox\gline=\hbox{\unhbox\gline
+ \hskip\glossglue
+ \vtop{\box\wordone % vtop was vbox
+ \nointerlineskip
+ \box\wordtwo
+ \nointerlineskip
+ \box\wordthree
+ }%
+ }%
+ \testdone
+ \ifnotdone
+ \repeat
+ \egroup % matches \bgroup in \gloss
+ \gl@stop}
+
+%\def\gl@stop{{\hskip -\glossglue}\unhbox\gline\end{flushleft}}
+
+% \leavevmode puts us back in horizontal mode, so that a \\ will work
+\def\gl@stop{{\hskip -\glossglue}\unhbox\gline\leavevmode \egroup}
+
+\endinput
diff --git a/Papers/Paper/glossa.cls b/Papers/Paper/glossa.cls
new file mode 100644
index 0000000..a6fe123
--- /dev/null
+++ b/Papers/Paper/glossa.cls
@@ -0,0 +1,633 @@
+% Glossa stylefile, modified from the
+% Semantics & Pragmatics style file.
+% Kai von Fintel, Christopher Potts, and Chung-chieh Shan
+% modifications for Glossa by Guido Vanden Wyngaerd
+% v1 13 Nov 2015
+% v2 10 Jan 2016
+% v3 16 Apr 2016
+% v4 26 Jun 2016
+% v5 16 Aug 2016
+% v6 29 Sep 2016
+% v7 27 Jan 2018 patches by Adam Liter for section headings
+% v8 16 May 2019 patches by GS for compatibility with xe/pdflatex
+% v- 19 Jun 2020 by mitcho to better match Glossa typesetting
+% v10 5 May 2021 changes to the stylesheet (no italics for subsections)
+
+\NeedsTeXFormat{LaTeX2e}[1994/06/01]
+\ProvidesClass{glossa}[2018/01/27 v.2.3 Class for Glossa]
+
+% OUTLINE OF THIS CLASS FILE
+% option declarations
+% required packages
+% metadata
+% page dimensions
+% title
+% running headers
+% frontmatter
+% sectioning
+% footnotes
+% backmatter
+% other environments
+% useful macros
+
+%=====================================================================
+%======================== option declarations ========================
+
+\newif\if@lucida\@lucidafalse
+\newif\if@cm\@cmtrue
+\newif\if@times\@timestrue
+\newif\if@brill\@brillfalse
+\newif\if@charis\@charisfalse
+\newif\if@final\@finalfalse
+\newif\if@biblatex\@biblatexfalse
+\newif\if@linguex\@linguexfalse
+
+\DeclareOption*{\PassOptionsToClass{\CurrentOption}{article}}
+
+\newcommand{\@sizeoption@err}{\ClassError{sp}
+ {Cannot use size option \CurrentOption}
+ {Glossa style requires (and automatically loads) 11pt text}}
+
+\DeclareOption{10pt}{\@sizeoption@err}
+\DeclareOption{12pt}{\@sizeoption@err}
+
+\DeclareOption{lucida}{\@lucidatrue \@timesfalse \@cmfalse \@brillfalse \@charisfalse}
+\DeclareOption{times}{\@lucidafalse \@timestrue \@cmfalse \@brillfalse \@charisfalse}
+\DeclareOption{cm}{\@lucidafalse \@timesfalse \@cmtrue \@brillfalse \@charisfalse}
+\DeclareOption{brill}{\@lucidafalse \@timesfalse \@cmtrue \@brilltrue \@charisfalse}
+\DeclareOption{charis}{\@lucidafalse \@timesfalse \@cmtrue \@brillfalse \@charistrue}
+\DeclareOption{final}{\@finaltrue}
+\DeclareOption{biblatex}{\@biblatextrue}
+\DeclareOption{linguex}{\@linguextrue}
+
+\ExecuteOptions{times} % times is the default.
+\ProcessOptions\relax
+\LoadClass[11pt,twoside]{article}
+
+\if@lucida
+ \IfFileExists{lucimatx.sty}{%
+ \RequirePackage[romanfamily=bright-osf, scale=0.9, stdmathdigits=true]{lucimatx}%
+ \linespread{1.05}%
+ \DeclareMathDelimiter{\llbracket}
+ {\mathopen}{letters}{130}{largesymbols}{130}
+ \DeclareMathDelimiter{\rrbracket}
+ {\mathclose}{letters}{131}{largesymbols}{131}
+ \normalfont\DeclareTextCommand
+ \textbullet\encodingdefault{\UseTextSymbol{OMS}\textbullet}
+ \let\nLeftrightarrow\undefined
+ \DeclareMathSymbol{\nLeftrightarrow}{\mathrel}{arrows}{105}
+}{\ClassWarning{glossa.cls}{Requested fonts not present}}%
+\else\relax
+\fi
+%
+\if@times
+ \RequirePackage[T1]{fontenc}% use T1 font encoding
+ \IfFileExists{mathptmx.sty}{\RequirePackage{mathptmx}}{}
+ \IfFileExists{stmaryrd.sty}%
+ {\RequirePackage{stmaryrd}}%
+ {\newcommand{\llbracket}{\ensuremath{\left [\!\left [}}%
+ \newcommand{\rrbracket}{\ensuremath{\right ]\!\right ]}}}
+ \RequirePackage{textcomp}
+ \RequirePackage{amssymb}
+ \else\relax
+\fi
+%
+\if@cm
+\IfFileExists{stmaryrd.sty}%
+ {\RequirePackage{stmaryrd}}%
+ {\newcommand{\llbracket}{\ensuremath{\left [\!\left [}}%
+ \newcommand{\rrbracket}{\ensuremath{\right ]\!\right ]}}}
+ \RequirePackage{amssymb}
+ \RequirePackage{textcomp}
+ \else\relax
+\fi
+
+\if@brill
+\IfFileExists{stmaryrd.sty}%
+ {\RequirePackage{stmaryrd}}%
+ {\newcommand{\llbracket}{\ensuremath{\left [\!\left [}}%
+ \newcommand{\rrbracket}{\ensuremath{\right ]\!\right ]}}}
+ \RequirePackage[no-math]{fontspec}
+ \setmainfont[RawFeature=+tnum]{Brill} %RawFeature ensures proper alignment of examples with linguex
+ \RequirePackage{amssymb}
+ \RequirePackage{textcomp}
+ \else\relax
+\fi
+
+\if@charis
+\IfFileExists{stmaryrd.sty}%
+ {\RequirePackage{stmaryrd}}%
+ {\newcommand{\llbracket}{\ensuremath{\left [\!\left [}}%
+ \newcommand{\rrbracket}{\ensuremath{\right ]\!\right ]}}}
+ \RequirePackage[bitstream-charter]{mathdesign} %math font close to Charis SIL
+ \RequirePackage[no-math]{fontspec}
+ \setmainfont{CharisSIL}
+ \RequirePackage{FiraSans} %sf font; download from https://www.fontsquirrel.com/fonts/fira-sans
+ \RequirePackage{amssymb}
+ \RequirePackage{textcomp}
+ \else\relax
+\fi
+
+% Strong widow and orphan control
+
+\clubpenalty10000
+\widowpenalty10000
+
+%=====================================================================
+%========================= required packages =========================
+%%% xunicode is not compatible
+%%% with pdflatex, and one should not use inputenc with xelatex
+%%% LuaLaTeX is incompatible with xunicode, but can safely load tipa
+\RequirePackage{iftex}
+\ifXeTeX
+ \RequirePackage{xunicode} %IPA characters are displayed; the commands of the tipa package are understood
+\else
+ \RequirePackage[safe]{tipa}
+\fi
+
+\ifPDFTeX
+ \RequirePackage[utf8]{inputenc}
+\else
+\fi
+%%% End modification
+
+\RequirePackage{xspace}
+% microtype handles punctuation at the right margin. We want it for the final product, but it's okay if authors lack it. MODIFIED by Coretta 2022-05-23: commented out microtype to circumvent an error during compilation.
+% \IfFileExists{microtype.sty}{%
+% \RequirePackage[final,protrusion={true,compatibility}]{microtype}
+% }{}
+\RequirePackage{ifthen}
+\RequirePackage[hyphens]{url}
+
+\if@biblatex
+ \RequirePackage[backend=biber,
+ bibstyle=biblatex-gl,
+ citestyle=gl-authoryear-comp,
+ maxcitenames=3,
+ maxbibnames=99]{biblatex}
+\else
+ \RequirePackage{natbib}
+ \bibpunct[: ]{(}{)}{; }{a}{}{;~}
+ \newcommand{\BIBand}{\&}
+ \setlength{\bibsep}{0pt}
+ \setlength{\bibhang}{0.25in}
+ \bibliographystyle{glossa}
+ \newcommand{\posscitet}[1]{\citeauthor{#1}'s (\citeyear{#1})}
+ \newcommand{\posscitealt}[1]{\citeauthor{#1}'s \citeyear{#1}}
+ \newcommand{\possciteauthor}[1]{\citeauthor{#1}'s}
+ \newcommand{\pgposscitet}[2]{\citeauthor{#1}'s (\citeyear{#1}:~#2)}
+ \newcommand{\secposscitet}[2]{\citeauthor{#1}'s (\citeyear{#1}:~$\S$#2)}
+ \newcommand{\pgcitealt}[2]{\citealt{#1}:~#2}
+ \newcommand{\seccitealt}[2]{\citealt{#1}:~$\S$#2}
+ \newcommand{\pgcitep}[2]{(\citealt{#1}:~#2)}
+ \newcommand{\seccitep}[2]{(\citealt{#1}:~$\S$#2)}
+ \newcommand{\pgcitet}[2]{\citeauthor{#1} (\citeyear{#1}:~#2)}
+ \newcommand{\seccitet}[2]{\citeauthor{#1} (\citeyear{#1}:~$\S$#2)}
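+  % Illustrative output of the helpers above (natbib branch): \posscitet{key}
+  % prints "Author's (Year)", \pgcitep{key}{12} prints "(Author Year: 12)",
+  % and \seccitet{key}{3} prints "Author (Year: §3)".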
+\fi
+
+\RequirePackage[usenames,dvipsnames]{xcolor}
+\definecolor{splinkcolor}{rgb}{.0,.2,.4}
+\RequirePackage[colorlinks,breaklinks,
+ linkcolor=splinkcolor,
+ urlcolor=splinkcolor,
+ citecolor=splinkcolor,
+ filecolor=splinkcolor,
+ plainpages=false,
+ pdfpagelabels,
+ bookmarks=false,
+ pdfstartview=FitH]{hyperref}
+\newcommand{\doi}[1]{\url{https://doi.org/#1}}
+\urlstyle{rm}
+\RequirePackage[leqno,tbtags]{amsmath}
+% If the author is using postscript (discouraged), then load the breakurl package, else don't load it.
+\RequirePackage{ifpdf}
+\ifpdf
+ \relax
+\else
+\relax
+ %\RequirePackage{breakurl}
+\fi
+\RequirePackage{graphicx}
+\RequirePackage{float}
+\RequirePackage[hang,FIGBOTCAP,loose]{subfigure}
+
+% additions to the S&P required packages for Glossa are listed below
+
+\RequirePackage[normalem]{ulem}
+\RequirePackage{enumitem}
+\RequirePackage[font=sf,labelfont=bf,labelsep=colon,justification=raggedright,singlelinecheck=off]{caption}
+\RequirePackage{booktabs}
+
+%=====================================================================
+%============================= metadata ==============================
+
+\def\@pdfauthor{\relax}
+\newcommand{\pdfauthor}[1]{\gdef\@pdfauthor{#1}}
+\def\@pdftitle{\relax}
+\newcommand{\pdftitle}[1]{\gdef\@pdftitle{#1}}
+\def\@pdfkeywords{\relax}
+\newcommand{\pdfkeywords}[1]{\gdef\@pdfkeywords{#1}}
+
+\hypersetup{pdfauthor=\@pdfauthor,
+ pdftitle=\@pdftitle,
+ pdfkeywords=\@pdfkeywords}
+
+\def\@spvolume{\relax}
+\newcommand{\spvolume}[1]{\gdef\@spvolume{#1}}
+
+\def\@sparticle{\relax}
+\newcommand{\sparticle}[1]{\gdef\@sparticle{#1}}
+
+\def\@spyear{\relax}
+\newcommand{\spyear}[1]{\gdef\@spyear{#1}}
+
+\def\@spdoi{10.5334/.\@spvolume.\@sparticle}
+\def\@splastpage{\relax}
+\newcommand{\splastpage}[1]{\gdef\@splastpage{#1}}
+
+%=====================================================================
+%========================== page dimensions ==========================
+
+% Vertical.
+\paperheight=297mm
+\topmargin=-13mm %
+\headheight=5mm % head: 30mm (margin + head + sep = 0.46cm); latex adds 1in)
+\headsep=17.6mm %
+\topskip=0.1in % included in the textheight
+\textheight=237mm % (297mm - 60mm)
+\footskip=0.46cm % foot: 30mm total (1.0in leftover)
+\parskip=0pt
+
+% Horizontal.
+\paperwidth=210mm
+\textwidth=150mm % (210mm - 60mm)
+\oddsidemargin=0.46cm % put at 3cm margins (3cm - 1in = 0.46cm)
+\evensidemargin=0.46cm % put at 3cm margins (3cm - 1in = 0.46cm)
+\raggedbottom % constant spacing in the text; cost is a ragged bottom
+\parindent=0.1in
+\leftmargini=0.5in
+\@ifundefined{mathindent}{}{\mathindent=0.5in\relax}%
+
+% Tell dvips about our paper.
+\special{papersize=210mm,297mm}
+
+%=====================================================================
+%============================== title ================================
+
+% Formats individual pairs inside \author.
+\newcommand{\spauthor}[1]%
+{\begin{minipage}[t]{16pc}\centering
+ #1%
+ \end{minipage}\hspace{.5pc plus1pc}%
+ \ignorespaces
+}
+
+\renewcommand*{\title}[2][]{\gdef\@shorttitle{#1}\gdef\@title{#2}}
+\renewcommand*{\author}[2][]{\gdef\@shortauthor{#1}\gdef\@author{#2}}
+
+% Adapted from JMLR.
+\renewcommand{\maketitle}{%
+ \par
+ \begingroup
+ \renewcommand{\thefootnote}{\fnsymbol{footnote}}
+ \@maketitle\@thanks
+ \setcounter{footnote}{0}
+ \endgroup
+ \let\maketitle\relax \let\@maketitle\relax
+ \gdef\@thanks{}
+ \let\thanks\relax%
+}
+
+% From salt.cls.
+\newskip\onelineskip
+\onelineskip=\baselineskip
+\advance\onelineskip by0pt plus 4pt minus 2pt
+
+\def\@maketitle{%
+ \vbox{\hsize\textwidth%
+ \linewidth\hsize%
+ \centering
+ \vskip\onelineskip
+ \LARGE\@title\@@par
+ \normalsize
+ \def\institute{\textit}%
+ \newcommand{\AND}{\ignorespaces}%
+ \let\par\@empty
+ \@author
+ \lineskiplimit\onelineskip
+ \lineskip\onelineskip
+ \@@par
+ }%
+ \global\everypar{\everypar{}\vskip 3.5ex}
+}
+
+%=====================================================================
+%========================== running headers ==========================
+
+% Creative commons license text. The font is even smaller here than it is elsewhere in the headers so that we have a chance of fitting the whole license on the page.
+\newcommand{\cctext}{{\footnotesize This is an open-access article distributed under the terms of a Creative Commons Attribution License
+ (\http{http://creativecommons.org/licenses/by/3.0/}).}}
+
+% This boolean switch lets the user control whether the logo is included even when the requisite image file is present. (If it is missing, then the class file accommodates that no matter how the switch is set.)
+\newboolean{logo}
+\setboolean{logo}{true} % Default true (include logo if it's present)
+\newcommand{\splogo}{\setboolean{logo}{true}}
+\newcommand{\nosplogo}{\setboolean{logo}{false}}
+
+% This sets the font size for the header and footer on all pages.
+\newcommand{\headerfontsize}{\footnotesize}
+
+% Prints publication and copyright info on the first page
+% Also loads info into metadata (superseded by new metadata commands)
+\gdef\@articlenumber{}%
+\newcommand{\firstpageheadings}[6]%
+{
+ \gdef\@articlenumber{#2}
+ \gdef\@spvolume{#1}
+ \gdef\@sparticle{#2}
+ \gdef\@splastpage{#3}
+ \gdef\@spyear{#4}
+ \def\ps@spfirstheadings{%
+ \let\@mkboth\@gobbletwo%
+ \renewcommand{\@oddhead}{%
+ \headerfontsize%
+ % If the switch is set to "include image",
+ \ifthenelse{\boolean{logo}}{%
+ \ifpdf
+ % If the pdf logo is present,
+ \IfFileExists{sp-logo.pdf}{%
+ % then insert the pdf version,
+ \begin{minipage}[c]{.25in}
+ \includegraphics[width=.25in]{sp-logo.pdf}
+ \end{minipage}%
+ }{}% else nothing; closes \IfFileExists
+ \else
+ % If the ps logo is present,
+ \IfFileExists{sp-logo.ps}{%
+ % then insert the postscript version,
+ \begin{minipage}[c]{.25in}
+ \includegraphics[width=.25in]{sp-logo.ps}
+ \end{minipage}%
+ }{}% else, nothing; closes \IfFileExists
+ \fi % close \ifpdf
+ }{}% closes \ifthenelse
+ \begin{minipage}[c]{5.25in}
+ \href{http://glossa.ubiquitypress.com/}{Glossa} Volume \@spvolume, Article \@sparticle: 1--\@splastpage, \@spyear\\
+ \href{https://doi.org/10.5334/sp.\@spvolume.\@sparticle}{https://doi.org/10.5334/.\@spvolume.\@sparticle}
+ \end{minipage}%
+ }%
+ \renewcommand{\@oddfoot}{%
+ \begin{minipage}[c]{1\textwidth}%
+ \footnotesize\copyright \@spyear\ \@shortauthor\\
+ \cctext
+ \end{minipage}%
+ }%
+ \renewcommand{\@evenhead}{}%
+ \renewcommand{\@evenfoot}{}%
+ }%
+ \thispagestyle{spfirstheadings}%
+}%
+
+\newcommand{\firstpagefinalheadings}%
+{
+ \def\ps@spfirstheadings{%
+ \let\@mkboth\@gobbletwo%
+ \renewcommand{\@oddhead}{%
+ \headerfontsize%
+ % If the switch is set to "include image",
+ \ifthenelse{\boolean{logo}}{%
+ \ifpdf
+ % If the pdf logo is present,
+ \IfFileExists{sp-logo.pdf}{%
+ % then insert the pdf version,
+ \begin{minipage}[c]{.25in}
+ \includegraphics[width=.25in]{sp-logo}
+ \end{minipage}%
+ }{}% else nothing; closes \IfFileExists
+ \else
+ % If the ps logo is present,
+ \IfFileExists{sp-logo.ps}{%
+ % then insert the postscript version,
+ \begin{minipage}[c]{.25in}
+ \includegraphics[width=.25in]{sp-logo}
+ \end{minipage}%
+ }{}% else, nothing; closes \IfFileExists
+ \fi % close \ifpdf
+ }{}% closes \ifthenelse
+ \begin{minipage}[c]{5.25in}
+             \href{http://glossa.ubiquitypress.com/}{Glossa} Volume \@spvolume, Article \@sparticle: 1--\@splastpage, \@spyear\\
+ \href{https://doi.org/\@spdoi}{https://doi.org/\@spdoi}
+ \end{minipage}%
+ \gdef\@articlenumber{\@sparticle}
+ }%
+ \renewcommand{\@oddfoot}{%
+ \begin{minipage}[c]{1\textwidth}%
+ \footnotesize\copyright \@spyear\ \@shortauthor\\
+ \cctext
+ \end{minipage}%
+ }%
+ \renewcommand{\@evenhead}{}%
+ \renewcommand{\@evenfoot}{}%
+ }%
+ \thispagestyle{spfirstheadings}%
+}%
+
+
+% Prints abbreviated article information on non-initial pages.
+\def\ps@spheadings{%
+ \let\@mkboth\@gobbletwo%
+ \def\@oddhead{{\headerfontsize\sffamily \@shorttitle}\hfill{\headerfontsize\sffamily\ifx\@empty\@articlenumber\else\@articlenumber:\fi\thepage}}% short title, inner
+ \def\@oddfoot{%\hfill{\headerfontsize\ifx\@empty\@articlenumber\else\@articlenumber:\fi%\thepage}\hfill
+ }% centered pg no
+ \def\@evenhead{{\headerfontsize\sffamily\ifx\@empty\@articlenumber\else\@articlenumber:\fi\thepage} \hfill {\headerfontsize\sffamily\@shortauthor}}% author names, inner
+ \def\@evenfoot{%\hfill{\headerfontsize\ifx\@empty\@articlenumber\else\@articlenumber:\fi\thepage}\hfill
+ }% centered pg no
+}
+\pagestyle{spheadings}
+
+%=====================================================================
+%=========================== final typeset ===========================
+
+
+\if@final
+\RequirePackage{sp-hyperxmp}
+\splogo
+\AtBeginDocument{\firstpagefinalheadings}
+\else
+\nosplogo
+\AtBeginDocument{\thispagestyle{plain}}
+\fi
+
+
+%=====================================================================
+%=========================== frontmatter =============================
+
+% The spacing specs (arg 2 of \list) are the same for the mshistory, abstract, and keywords environments, except that the abstract is indented somewhat.
+
+% Single parameter determines the left and right margin size.
+\newcommand{\frontmatterspacing}[1]{%
+ \small
+ \topsep 10\p@ \@plus4\p@ \@minus6\p@ % from size12.clo
+ \advance\topsep by3.5ex plus -1ex minus -.2ex
+ \setlength{\listparindent}{0em}
+ \setlength{\itemindent}{0em}
+ \setlength{\leftmargin}{#1}
+ \setlength{\rightmargin}{\leftmargin}
+ \setlength{\parskip}{0em}
+}
+
+\newenvironment{mshistory}%
+{\list{}{\frontmatterspacing{0em}}%
+\item\relax}%
+{\endlist}
+
+\renewenvironment{abstract}%
+{\list{}{\frontmatterspacing{0.25in}}%
+\item\relax\textbf{\abstractname} }%
+{\endlist}
+
+\newenvironment{keywords}%
+{\list{}{\frontmatterspacing{0em}}%
+\item\relax\textbf{Keywords:}}%
+{\endlist}
+
+%=====================================================================
+%============================ sectioning =============================
+
+\setcounter{secnumdepth}{5}
+\setcounter{tocdepth}{5}
+
+\renewcommand{\thesection}{\arabic{section}.}
+\renewcommand{\thesubsection}{\arabic{section}.\arabic{subsection}}
+\renewcommand{\thesubsubsection}{\arabic{section}.\arabic{subsection}.\arabic{subsubsection}}
+
+\renewcommand{\section}{\@startsection{section}{1}{0pt}%
+ {-3.5ex plus -1ex minus -.2ex}%
+ {1.8ex plus.2ex}%
+ {\noindent\normalfont\Large\sffamily\bfseries}}
+
+\renewcommand{\subsection}{\@startsection{subsection}{2}{0pt}%
+ {-3.5ex plus -1ex minus -.2ex}%
+ {1.8ex plus.2ex}%
+ {\noindent\normalfont\large\sffamily\bfseries}}
+
+\renewcommand{\subsubsection}{\@startsection{subsubsection}{3}{0pt}%
+ {-3.5ex plus -1ex minus -.2ex}%
+ {1.8ex plus.2ex}%
+ {\noindent\normalfont\normalsize\sffamily\bfseries}}
+
+\renewcommand{\paragraph}{\@startsection{paragraph}{4}{\z@}%
+ {-3.5ex plus -1ex minus -.2ex}%
+ {1.8ex plus.2ex}%
+ {\noindent\normalsize\sffamily\bfseries}}
+
+\renewcommand{\subparagraph}{\@startsection{subparagraph}{5}{\z@}%
+ {-3.5ex plus -1ex minus -.2ex}%
+ {1.8ex plus.2ex}%
+ {\noindent\normalsize\sffamily\it}}
+
+% General formatting --- for punctuating section headings.
+\renewcommand{\@seccntformat}[1]{\@nameuse{the#1}\hspace{1em}}
+
+%=====================================================================
+%============================ footnotes ==============================
+
+\renewcommand{\@makefntext}[1]{%
+ \parindent=0.25in
+ \noindent \hbox to \z@{\hss{\textsuperscript{\@thefnmark}} \hfil}#1}
+
+%=====================================================================
+%============================ backmatter =============================
+
+% Environment for formatting all the addresses.
+\newenvironment{addresses}%
+{\@@par
+ \let\par\@empty
+ \addvspace{3.25ex}%
+ \noindent%\textbf{Author addresses}
+ \small
+ % Individual author addresses.
+ \newenvironment{address}%
+ {% For email addresses inside the address environment.
+ %\newcommand{\email}{\texttt}%
+ \begin{minipage}[t]{19pc}\raggedright}
+ {\end{minipage}\hspace{.15pc plus1pc}}%
+ \ignorespaces
+}%
+{\lineskiplimit 1pc
+ \lineskip 1pc
+ \@@par}
+
+%=====================================================================
+%======================== other environments =========================
+
+% enumerate labeling that won't conflict with standard ex. numbers.
+\renewcommand{\theenumi}{\roman{enumi}}
+\renewcommand{\labelenumi}{\theenumi.}
+\renewcommand{\theenumii}{\alph{enumii}}
+\renewcommand{\labelenumii}{\theenumii.}
+
+% list spacing ought to satisfy \itemsep + \parsep < \topsep + \parskip
+\def\@listi{\leftmargin\leftmargini
+ \parsep 4\p@ \@plus2\p@ \@minus\p@
+ \topsep 10\p@ \@plus4\p@ \@minus\p@
+ \itemsep4\p@ \@plus2\p@ \@minus\p@}
+\let\@listI\@listi
+\@listi
+
+% typeset figures with lines above and below and the caption title and caption texts in boxes next to each other, top aligned. these definitions extend those of float.sty.
+%
+% sp caption style
+%\newcommand\floatc@sp[2]{%
+% \parbox[t]{0.15\textwidth}{{\@fs@cfont#1}}%
+% \parbox[t]{0.85\textwidth}{#2}}%
+
+% sp float style; uses the sp caption style
+%\newcommand\fs@sp{
+% \def\@fs@cfont{\bfseries}\let\@fs@capt\floatc@sp
+% \def\@fs@pre{\hrule\kern5pt}%
+% \def\@fs@post{\kern5pt\hrule\relax}%
+% \def\@fs@mid{\kern10pt}
+% \let\@fs@iftopcapt\iffalse}
+
+% users can override these commands using float.sty's functionality
+%\floatstyle{sp}
+%\restylefloat{figure}
+%\restylefloat{table}
+
+%=====================================================================
+%=========================== useful macros ===========================
+
+\newcommand{\spj}{\emph{S\&P}\xspace}
+
+\def\co{\colon\thinspace}
+
+\DeclareRobustCommand\dash{%
+ \unskip\nobreak\thinspace\textemdash\thinspace\ignorespaces}
+\pdfstringdefDisableCommands{\renewcommand{\dash}{ - }}
+
+% based on \url defined in hyperref.sty
+\DeclareRobustCommand*{\http}{\hyper@normalise\http@}
+\def\http@#1{\hyper@linkurl{\Hurl{#1}}{http://#1}}
+
+\newcommand{\email}[1]{\href{mailto:#1}{#1}}
+
+\providecommand{\sv}[1]{\ensuremath{\llbracket #1 \rrbracket}}
+
+%=====================================================================
+%=========================== linguex settings ========================
+
+\if@linguex
+ \RequirePackage{linguex}%
+ \renewcommand{\firstrefdash}{}%
+ \AtBeginDocument{\settowidth{\Exlabelwidth}{(110)}}
+\else
+\relax
+\fi
+
+\RequirePackage{cgloss} %for adding the language name and source of the example on the first line of glossed examples (requires \gll before the foreign language example and \glt before the translation)
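+% Example usage (inside an example environment such as gb4e's \ex or linguex's
+% \ex.; the sentence is the sample from cgloss.sty's own documentation):
+%   \gll Dit is een Nederlands voorbeeld.\\
+%        This is a Dutch example.\\
+%   \glt `This is an example in Dutch.'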
+
+%================================ miscellaneous ======================
+%=====================================================================
+
+\setlist{nolistsep} %reduce space between items in lists
diff --git a/Papers/Paper/manuscript.Rmd b/Papers/Paper/manuscript.Rmd
index d602c02..5b6e56d 100644
--- a/Papers/Paper/manuscript.Rmd
+++ b/Papers/Paper/manuscript.Rmd
@@ -22,7 +22,7 @@ header-includes:
- \setcounter{bottomnumber}{3}
- \setcounter{totalnumber}{4}
-classoption: [times, biblatex]
+classoption: [times]
#
# Possible classoptions:
# - [times] for Times font (default if no option is chosen)
@@ -50,7 +50,6 @@ classoption: [times, biblatex]
# (for US cities in the format "Santa Cruz, CA")
#
bibliography: ["r-references.bib","refs.bib"]
-biblio-style: "apalike"
link-citations: true
# The bibliography style is set automatically by glossa.cls when using
# either natbib or biblatex.
@@ -59,15 +58,15 @@ link-citations: true
abstract: |
Behavioral measures of word-by-word reading time provide experimental evidence to test theories of language processing. A-maze is a recent method for measuring incremental sentence processing that can localize slowdowns related to syntactic ambiguities in individual sentences. We adapted A-maze for use on longer passages and tested it on the Natural Stories corpus. Participants were able to comprehend these longer text passages that they read via the Maze task. Moreover, the Maze task yielded useable reaction time data with word predictability effects that were linearly related to surprisal, the same pattern found with other incremental methods. Crucially, Maze reaction times show a tight relationship with properties of the current word, with little spillover of effects from previous words. This superior localization is an advantage of Maze compared with other methods. Overall, we expanded the scope of experimental materials, and thus theoretical questions, that can be studied with the Maze task.
# Specify keywords here:
-keywords: A-Maze, self-paced reading, incremental processing, surprisal, naturalistic text
+keywords: A-maze, self-paced reading, incremental processing, surprisal, naturalistic text
# To add the word count, uncomment the following option and replace the number
# with the document word count.
wordcount: 9594
+csl: "apa7.csl"
output:
- bookdown::pdf_book:
- base_format: rticles::glossa_article
- number_sections: yes
-# papaja::apa6_pdf
+ bookdown::pdf_book:
+ base_format: rticles::glossapx_article
+ #papaja::apa6_pdf
---
```{r setup, include = FALSE}
@@ -95,17 +94,17 @@ options(knitr.table.format = "pdf")
# Introduction
-Two chief results of human language processing research are that comprehension is highly incremental and that comprehension difficulty is differential and localized. Incrementality in comprehension means that our minds do not wait for large stretches of linguistic input to accrue; rather, we eagerly analyze each moment of input and rapidly integrate it into context [@marslen-wilson:1975]. Differential and localized processing difficulty means that different inputs in context present different processing demands during comprehension [@levy:2008]. Due to incrementality these differential processing demands are, by and large, met relatively quickly by the mind once they are presented, and they can be measured in both brain [@kutas-hillyard:1980; @osterhout-holcomb:1992jml] and behavioral [@raynerEyeMovementsReading1998; @mitchell:2004online-methods] responses. These measurements often have low signal-to-noise ratio, and many methods require bringing participants into the lab and often require cumbersome equipment. However, they can provide considerable insight into how language processing unfolds in real time. Developing more sensitive methods that can easily be used with remote participants is thus of considerable interest.
+Two chief results of human language processing research are that comprehension is highly incremental and that comprehension difficulty is differential and localized. Incrementality in comprehension means that our minds do not wait for large stretches of linguistic input to accrue; rather, we eagerly analyze each moment of input and rapidly integrate it into context [@marslen-wilson:1975]. Differential and localized processing difficulty means that different inputs in context present different processing demands during comprehension [@levy:2008]. Due to incrementality, these differential processing demands are, by and large, met relatively quickly by the mind once they are presented, and they can be measured in both brain [@kutas-hillyard:1980; @osterhout-holcomb:1992jml] and behavioral [@raynerEyeMovementsReading1998; @mitchell:2004online-methods] responses. These measurements often have low signal-to-noise ratio, and many methods require bringing participants into the lab and often require cumbersome equipment. However, they can provide considerable insight into how language processing unfolds in real time. Developing more sensitive methods that can easily be used with remote participants is thus of considerable interest.
-Word-by-word reading or response times are among the most widely used behavioral measurements in language comprehension and give relatively direct insight into processing difficulty. The Maze task [@freedman85; @forsterMazeTaskMeasuring2009], which involves collecting participants' response times in a repeated two-alternative forced-choice between a word that fits the preceding linguistic context and a distractor that doesn't, has recently been proposed as a high-sensitivity method that can easily be used remotely. @boyceMazeMadeEasy2020 introduced several implementational innovations that made it easier for researchers to use Maze, and showed that for several controlled syntactic processing contrasts [@witzelComparisonsOnlineReading2012a] Maze offers better statistical power than self-paced reading, the other word-by-word response time method easy to use remotely. Maze has since had rapid uptake in the language processing community [@chacon2021limits; @ungerer2021using; @orth2022processing;@van-lieburg-etal:2022-using-the-maze-task;@levinson:2022-beyond-surprising].
+Word-by-word reading or response times are among the most widely used behavioral measurements in language comprehension and give relatively direct insight into processing difficulty. The Maze task [@freedman85; @forsterMazeTaskMeasuring2009], which involves collecting participants' response times in a repeated two-alternative forced-choice between a word that fits the preceding linguistic context and a distractor that doesn't, has recently been proposed as a high-sensitivity method that can easily be used remotely. @boyceMazeMadeEasy2020 introduced several implementational innovations that made it easier for researchers to use Maze, and showed that, for several controlled syntactic processing contrasts [@witzelComparisonsOnlineReading2012a], Maze offers better statistical power than self-paced reading, the other word-by-word response time method that is easy to use remotely. Maze has since had rapid uptake in the language processing community [@chacon2021limits; @ungerer2021using; @orth2022processing; @van-lieburg-etal:2022-using-the-maze-task; @levinson:2022-beyond-surprising].
There is increasing interest in collecting data during comprehension of more naturalistic materials such as stories and news articles [@demberg-keller:2008; @lukeLimitsLexicalPrediction2016; @futrellNaturalStoriesCorpus2020], which offer potentially improved ecological validity and larger scale data in comparison with repeated presentation of isolated sentences out of context. These more naturalistic materials require maintaining and integrating discourse dependencies and other types of information over longer stretches of time and linguistic material. Previous work leaves unclear whether the Maze task would be feasible for this purpose: the increased task demands might interfere with the demands presented by these more naturalistic materials, and vice versa. In this paper we report a new modification of the Maze task and show that it makes reading of extended, naturalistic texts feasible. We also analyze the resulting reaction time profiles and show that they provide strong signal regarding the probabilistic relationship between a word and the context in which it appears, and that the systematic linear relationship between word surprisal and response time observed in other reading paradigms [@smithEffectWordPredictability2013] also arises in the Maze task.
-In the remainder of the Introduction, we lay out the role of RT-based methods in theory testing, describe a few common methods, and review some key influences on reading time. We then proceed to present our modified "error-correction Maze" paradigm, our experiment, and the results of our analyses of the resulting data.
+In the remainder of the introduction, we lay out the role of RT-based methods in theory testing, describe a few common methods, and review some key influences on reading time. We then proceed to present our modified "error-correction Maze" paradigm, our experiment, and the results of our analyses of the resulting data.
## Why measure RTs?
-A major feature of human language processing is that not all sentences or utterances are equally easy to successfully comprehend. Sometimes this is mostly or entirely due to the linguistic structure of the sentence: for example, *The rat that the cat that the dog chased killed ate the cheese* is more difficult than *The rat that was killed by the cat that was chased by the dog ate the cheese* even though the meaning of the two sentences is (near-)identical. Sometimes the source of difficulty can be a mismatch between expectations set up by the context and the word choice in an utterance: for example, the question *Is the cup red?* may be confusing in a context containing more than one cup. Psycholinguistic theories may differ in their ability to predict what is easy and what is hard. One of the most powerful methods for studying these differential difficulty effects is let the comprehender control the pace of presentation of the linguistic material, and to measure what she takes time on. For this purpose, taking measurements from experimental participants during reading, a widespread, highly practiced skill in diverse populations around the world, is of unparalleled value.
+A major feature of human language processing is that not all sentences or utterances are equally easy to successfully comprehend. Sometimes this is mostly or entirely due to the linguistic structure of the sentence: for example, *The rat that the cat that the dog chased killed ate the cheese* is more difficult than *The rat that was killed by the cat that was chased by the dog ate the cheese* even though the meaning of the two sentences is (near-)identical. Sometimes the source of difficulty can be a mismatch between expectations set up by the context and the word choice in an utterance: for example, the question *Is the cup red?* may be confusing in a context containing more than one cup. Psycholinguistic theories may differ in their ability to predict what is easy and what is hard. One of the most powerful methods for studying these differential difficulty effects is to let the comprehender control the pace of presentation of the linguistic material, and to measure what she takes time on. For this purpose, taking measurements from experimental participants during reading, a widespread, highly practiced skill in diverse populations around the world, is of unparalleled value.
To a first approximation, everyday reading (when the reader's goal is to understand a text's overall content) is *progressive*: we read documents, paragraphs, and sentences from beginning to end. The reader encounters each word with the benefit of the preceding linguistic context. Incrementality in reading involves successively processing each word encountered and integrating it into the context. For a skilled reader experienced with the type of text being read, most words are easy enough that the subjective experience of reading the text is of smooth, continuously unfolding understanding as we construct a mental model of what is being described. But occasionally a word may be sufficiently surprising or otherwise difficult to reconcile with the context that it disrupts comprehension to the level of conscious awareness: in the sentence *I take my coffee with cream and chamomile*, for example, the last word is likely to do so. Behaviorally, this disruption typically manifests as a slowdown or longer *reading time* (RT) on the word itself, on the immediately following words, or in other forms such as regressive eye movements back to earlier parts of the text to check the context.
@@ -115,22 +114,22 @@ For instance, competing theories about why certain types of object-extracted rel
Some of these theories rely on being able to attribute processing slowdowns to a particular word. Determining that object relatives are overall slower than subject relatives is easy. Even an imprecise RT measure will determine that the same set of words in a different order took longer to read at a sentence level. However, many language processing theories make specific (and contrasting) predictions about which words in a sentence are harder to process. To adjudicate among these theories, we want methods that are *well-localized*, so that it is easy to determine which word is responsible for an observed RT slowdown. Ideally, a longer RT on a word would be an indication of that word's increased difficulty, and not the lingering signal of a prior word's increased difficulty. When the signal isn't localized, advanced analysis techniques may be required to disentangle the slowdowns [@shainDeconvolutionalTimeSeries2018].
-## Eye-tracking and Self-paced reading
+## Eye-tracking and self-paced reading
The two most commonly used behavioral methods for studying incremental language processing during reading are tracking eye movements and self-paced reading. While both of these have proven powerful and highly flexible, they both have important limitations as well.
In eye-tracking, participants read a text on a screen naturally, while their saccadic eye movements are recorded by a computer-connected camera that is calibrated so that the researcher can reconstruct with high precision where the participant's gaze falls on the screen at all times [@raynerEyeMovementsReading1998]. These eye movements can be used to reconstruct various position-specific reading time measures such as *gaze duration* (the total amount of time the eyes spend on a word the first time it is fixated before saccading to a later word) and *total viewing time* (the total amount of time that the word is fixated). If the eyes skipped the word the first time it was approached from the left, the trial is generally excluded. Eye-tracking data collected with state-of-the-art high-precision recording equipment offers relatively good signal-to-noise ratio, but the difficulty presented by a word can still *spill over* into reading measures on subsequent words, a dynamic that can make it hard to isolate the source of an effect of potential theoretical interest [@raynerEffectsFrequencyPredictability2004; @levy-etal:2009pnas; @frazierMakingCorrectingErrors1982]. Short words such as articles and pronouns are often not fixated directly, which makes it harder to study the processing of these words with eye-tracking. Additionally, the equipment is expensive and data collection is laborious and must occur in-lab.
-Self-paced reading (SPR; @mitchell:1984) is a somewhat less natural paradigm in which the participant manually controls the visual presentation of the text by pressing a button. In its generally preferred variant, moving-window self-paced reading, words are revealed one at a time or one group at a time: every press of the button masks the currently presented word (group) and simultaneously reveals the next. The time spent between button presses is the unique RT measure for that word (group). Self-paced reading requires no special equipment and can be delivered remotely, but the measurements are noisier and even more prone to spillover [@macdonaldInteractionLexicalSyntactic1993; @koornneefUseVerbbasedImplicit2006; @smithEffectWordPredictability2013].
+Self-paced reading (SPR) is a somewhat less natural paradigm in which the participant manually controls the visual presentation of the text by pressing a button [@mitchell:1984]. In its generally preferred variant, moving-window self-paced reading, words are revealed one at a time or one group at a time: every press of the button masks the currently presented word (group) and simultaneously reveals the next. The time spent between button presses is the unique RT measure for that word (group). Self-paced reading requires no special equipment and can be delivered remotely, but the measurements are noisier and even more prone to spillover [@macdonaldInteractionLexicalSyntactic1993; @koornneefUseVerbbasedImplicit2006; @smithEffectWordPredictability2013].
## Maze
-The Maze task is an alternative method that is designed to increase localization at the expense of naturalness [@freedman85; @forsterMazeTaskMeasuring2009]. In the Maze task, participants must repeatedly choose between two simultaneously presented options: a correct word that continues the sentence, and a distractor string which does not. Participants must choose the correct word, and their time to selection is treated as the reaction time, or RT. (We deliberately overload the abbreviation "RT" and use it for Maze reaction times as well as reading times from eye tracking and SPR, because the desirable properties of reading times turn out to hold for Maze reaction times as well.) @forsterMazeTaskMeasuring2009 introduced two versions of the Maze task: lexical "L"-maze where the distractors are non-word strings, and grammatical "G"-maze where the distractors are real words that don't fit with the context of the sentence. In theory, participants must fully integrate each word into the sentence in order to confidently select it, which may require mentally reparsing previous material in order to allow the integration and selection of a disambiguating word. @forsterMazeTaskMeasuring2009 call this need for full integration "forced incremental processing" to distinguish from other incremental processing methods where words can be passively read before later committing to a parse. This idea of strong localization is supported by studies finding strongly localized effects for G-maze [@witzelComparisonsOnlineReading2012a; @boyceMazeMadeEasy2020].
+The Maze task is an alternative method that is designed to increase localization at the expense of naturalness [@freedman85; @forsterMazeTaskMeasuring2009]. In the Maze task, participants must repeatedly choose between two simultaneously presented options: a correct word that continues the sentence, and a distractor string that does not. Participants must choose the correct word, and their time to selection is treated as the reaction time, or RT. (We deliberately overload the abbreviation "RT" and use it for Maze reaction times as well as reading times from eye tracking and SPR, because the desirable properties of reading times turn out to hold for Maze reaction times as well.) @forsterMazeTaskMeasuring2009 introduced two versions of the Maze task: lexical "L"-maze, where the distractors are non-word strings, and grammatical "G"-maze, where the distractors are real words that don't fit with the context of the sentence. In theory, participants must fully integrate each word into the sentence in order to confidently select it, which may require mentally reparsing previous material in order to allow the integration and selection of a disambiguating word. @forsterMazeTaskMeasuring2009 call this need for full integration "forced incremental sentence processing" (the phrase in their title; p. 163) to distinguish it from other incremental processing methods where words can be passively read before later committing to a parse. This idea of strong localization is supported by studies finding strongly localized effects for G-maze [@witzelComparisonsOnlineReading2012a; @boyceMazeMadeEasy2020].
The Maze task has less face validity than eye-tracking or even SPR; repeated forced-choice selections do not seem very similar to normal reading. Despite this, @forsterMazeTaskMeasuring2009 report that "At a phenomenological level, participants typically report that they feel as if they are reading the sentence relatively
naturally and that the correct alternative seems to “leap out” at them, so that they do not have to inspect the incorrect
-alternative very carefully, if at all." This suggests that the Maze task may rely on the same language processing facilities tapped into by other reading methods. Thus, using Maze may not be the best paradigm for studying the process of normal reading, but may be perfectly good or even superior for getting at underlying language processing.
+alternative very carefully, if at all" (p. 164). This suggests that the Maze task may rely on the same language processing facilities tapped into by other reading methods. Thus, using Maze may not be the best paradigm for studying the process of normal reading, but may be perfectly good or even superior for getting at underlying language processing.
However, G-maze materials are effort-intensive to construct because of the need to select infelicitous words as distractors for each spot of each sentence. This burdensome preparation may explain why the Maze task was not widely adopted. @boyceMazeMadeEasy2020 demonstrated a way to automatically generate Maze distractors by using language models from Natural Language Processing to find words that are high surprisal in the context of the target sentence, and thus likely to be judged infelicitous by human readers. @boyceMazeMadeEasy2020 call Maze with automatically generated distractors A-maze. In a comparison, A-maze distractors had similar results to the hand-generated G-maze distractors from @witzelComparisonsOnlineReading2012a and A-maze outperformed L-maze and an SPR control in detecting and localizing expected slowdown effects. @sloggettAmazeAnyOther2020 also found that A-maze and G-maze distractors yielded similar results on a disambiguation paradigm.
@@ -141,9 +140,9 @@ Another recent variant of the Maze task is interpolated I-maze, which uses a mix
Localized measures can be used to attribute processing difficulty to individual words; however, determining whether a method is localized requires knowing how hard the words were to process. One approach is to look at properties of words that are known to influence reading times across methods such as eye-tracking and SPR. Longer words and lower-frequency words tend to take longer to process [@klieglLengthFrequencyPredictability2004], as do less predictable words [@raynerEffectsFrequencyPredictability2004].
-A word can be unpredictable for a variety of reasons: it could be low frequency, semantically unexpected, the start of a low-frequency syntactic construction, or a word that disambiguates prior words to a less common parse. Many targeted effects of interest can thus be potentially accommodated theoretically as specific features that influence word predictability. ^[Of course, not all effects can necessarily be reduced to word predictability effects, and effects that *cannot* be reduced to word predictability may be of particular theoretical interest. Candidates include, for example, memory-based effects [@lewis-etal:2006; @levy-fedorenko-gibson:2013jml], noisy-channel error identification [@levy-etal:2009pnas], and the magnitude of processing difficulty in garden-path resolution [@van-schijndel-linzen-2021:single-stage-models; @wilcoxTargetedAssessmentIncremental2021].] Thus incremental processing methods that are sensitive to predictability are useful for testing linguistic theories that make predictions about what words are unexpected.
+A word can be unpredictable for a variety of reasons: it could be low frequency, semantically unexpected, the start of a low-frequency syntactic construction, or a word that disambiguates prior words to a less common parse. Many targeted effects of interest can thus be potentially accommodated theoretically as specific features that influence word predictability.^[Of course, not all effects can necessarily be reduced to word predictability effects, and effects that *cannot* be reduced to word predictability may be of particular theoretical interest. Candidates include, for example, memory-based effects [@lewis-etal:2006; @levy-fedorenko-gibson:2013jml], noisy-channel error identification [@levy-etal:2009pnas], and the magnitude of processing difficulty in garden-path resolution [@van-schijndel-linzen-2021:single-stage-models; @wilcoxTargetedAssessmentIncremental2021].] Thus incremental processing methods that are sensitive to predictability are useful for testing linguistic theories that make predictions about what words are unexpected.
-The overall predictability of a word in a context can be estimated using language models that are trained on large corpora of language to predict what word comes next in a sentence. A variety of pre-trained models exist, with varied internal architectures and training methods, but all of them generate measures of predictability. Predictability is often measured in bits of surprisal, which is the negative log probability of a word [@hale:2001, @levy:2008]. 1 bit of surprisal means a word is expected to occur half the time, 2 bits is 1/4 of the time, etc.
+The overall predictability of a word in a context can be estimated using language models that are trained on large corpora of language to predict what word comes next in a sentence. A variety of pre-trained models exist, with varied internal architectures and training methods, but all of them generate measures of predictability. Predictability is often measured in bits of surprisal, which is the negative log probability of a word in context [@hale:2001; @levy:2008]. One bit of surprisal means a word is expected to occur half the time, 2 bits means a quarter of the time, and so on.
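+In symbols, the surprisal of word $w_i$ in its context is $$s(w_i) = -\log_2 P(w_i \mid w_1, \ldots, w_{i-1}),$$ so the examples above correspond to conditional probabilities of $1/2$ (1 bit) and $1/4$ (2 bits).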
The functional form of the relationship between RTs from eye-tracking and SPR studies and the predictability of the words is linear in terms of surprisal [@smithEffectWordPredictability2013; @wilcoxPredictivePowerNeural2020; @goodkindPredictivePowerWord2018; @lukeLimitsLexicalPrediction2016], even when two important context-invariant word features known to influence RTs, length and frequency, are controlled for. Predictability reliably correlates with reading time over a wide range of surprisals found in natural-sounding texts, not just for words that are extremely expected or unexpected [@smithEffectWordPredictability2013]. If Maze RTs reflect the same processing as other methods, we expect to find a similar linear relationship with surprisal.
@@ -178,7 +177,7 @@ One of the benefits of the Maze task is that it forces incremental processing by
However, terminating sentences on errors means that we don't have RTs for words after a participant makes a mistake in an item. In traditional G-maze tasks, with hand-crafted distractors and attentive participants, errors are rare and data loss is a small issue. However, this data loss can be worse with A-maze materials and crowd-sourced participants [@boyceMazeMadeEasy2020]. The high error rates are likely due to some combination of participants guessing randomly and auto-generated distractors that in fact fit the sentence; as @boyceMazeMadeEasy2020 noted, some distractors, especially early in the sentence, were problematic and caused considerable data loss.
-The high error rates could be improved by auto-generating better distractors or hand-replacing problematic ones, but that does not solve the fundamental problem with long items. Well-chosen distractors and attentive participants reduce the error rate, but the error rate will still compound over long materials. For instance, with a 1% error rate, `r round(.99**15*100)`% of participants would complete each 15-word sentence, but only `r round(.99**50*100)`% would complete a 50 word vignette, and `r round(.99**200*100)`% would complete a 200 word passage. In order to run longer materials, we needed something to do when participants made a mistake other than terminate the entire item.
+The high error rates could be improved by auto-generating better distractors or hand-replacing problematic ones, but that does not solve the fundamental problem with long items. Well-chosen distractors and attentive participants reduce the error rate, but the error rate will still compound over long materials. For instance, with a 1% error rate, `r round(.99**15*100)`% of participants would complete each 15-word sentence, but only `r round(.99**50*100)`% would complete a 50-word vignette, and `r round(.99**200*100)`% would complete a 200-word passage. In order to run longer materials, we needed something to do when participants made a mistake other than terminate the entire item.
As a solution, we introduce an *error-correction* variant of Maze shown in Figure \@ref(fig:diagram). When a participant makes an error, they see an error message and must try again to select the correct option, before continuing the sentence as normal. We make error-correction Maze available as an option in a modification of the Ibex Maze implementation introduced in @boyceMazeMadeEasy2020 (https://github.com/vboyce/Ibex-with-Maze). The code records both the RT to the first click and also the total RT until the correct answer is selected as separate values.
@@ -261,7 +260,7 @@ data_sentence <- data_before <- data_good %>%
spr <- read_rds(here("Data/SPR/first.rds")) %>% filter(correct>4) %>% select(WorkerId) %>% unique()
```
-We recruited 100 participants from Amazon Mechanical Turk in April 2020, and paid each participant $3.50 for roughly 20 minutes of work. We excluded data from those who did not report English as their native language, leaving `r data_filt %>% select(subject) %>% unique() %>% nrow()` participants. After examining participants' performance on the task (see Results for details), we excluded data from participants with less than 80% accuracy, removing participants whose behavior was consistent with random guessing. After this exclusion, `r some %>% select(subject) %>% unique() %>% nrow()` participants were left.
+We recruited 100 participants from Amazon Mechanical Turk in April 2020, and paid each participant $3.50 for roughly 20 minutes of work. We excluded data from those who did not report English as their native language, leaving `r data_filt %>% select(subject) %>% unique() %>% nrow()` participants. After examining participants' performance on the task (see results for details), we excluded data from participants with less than 80% accuracy, removing participants whose behavior was consistent with random guessing. After this exclusion, `r some %>% select(subject) %>% unique() %>% nrow()` participants were left.
## Procedure
Participants first gave their informed consent and saw task instructions. Then they read a short practice story in the Maze paradigm and answered 2 binary-choice practice comprehension questions, before reading one main story in the error-correction A-maze task. After the story, they answered 6 comprehension questions, commented on their experience, answered optional demographic questions, were debriefed, and were given a code to enter for payment. The experiment was implemented in Ibex (https://github.com/addrummond/ibex).
@@ -283,13 +282,14 @@ package_list <- c("tidyverse","brms","rstan","bookdown", "rticles", "papaja",
a <- cite_r("r-references.bib", pkgs=package_list, withhold=F, footnote=T)
+pkg_citations <- a$pkgs[1] |> str_sub(3, -3) # drop the first two and last two characters of the package citation text
```
## Modeling approach
-Our analytic questions required multiple modeling approaches. To look at the functional form of the relationship between surprisal and RT data, we fit Generalized Additive Models (GAMs) to allow for non-linear relationships [@wood:2017GAMs]. GAM model summaries can be harder to interpret than those for linear models, so to measure effect sizes and assess spillover, we used linear mixed models. Finally, in order to determine which language model best predicts the RT data, we fit additional linear models with predictors from multiple language models to look at their relative contributions. All these models used surprisal, frequency, and length as predictors for RT. We considered these predictors from both the current and past word to account for the possibility of spillover effects in A-maze. For SPR comparisons, we included predictors from the current and past three words to account for known spillover effects. We conducted data processing and analyses using `r a$r`.
+Our analytic questions required multiple modeling approaches. To look at the functional form of the relationship between surprisal and RT data, we fit Generalized Additive Models (GAMs) to allow for non-linear relationships [@wood:2017GAMs]. GAM model summaries can be harder to interpret than those for linear models, so to measure effect sizes and assess spillover, we used linear mixed models. Finally, in order to determine which language model best predicts the RT data, we fit additional linear models with predictors from multiple language models to look at their relative contributions. All these models used surprisal, frequency, and length as predictors for RT. We considered these predictors from both the current and past word to account for the possibility of spillover effects in A-maze. For SPR comparisons, we included predictors from the current and past three words to account for known spillover effects. We conducted data processing and analyses using R Version 4.2.2 [@R-base] [^papaja_pkg_citations].
-`r a$pkgs`
+`r pkg_citations`
```{r}
surps <- read_rds(here("Prep_code/natural_stories_surprisals.rds"))
@@ -326,9 +326,9 @@ We created a set of predictor variables of frequency, word length, and surprisal
### Exclusions
-In the Maze task, the first word of every sentence is paired with a nonce (x-x-x) distractor rather than a real word (as there is no context to use to distinguish between real words); due to this difference, we excluded the first word of every sentence, leaving `r nrow(not_first)` words. We excluded words for which we didn't have surprisal or frequency information, leaving `r nrow(not_na)` words. We additionally excluded words that any model treated as being composed of multiple tokens (primarily words with punctuation), leaving `r nrow(not_gpt)` words[^1]. We excluded outlier RTs that were <100 or >5000 ms (<100 is likely a recording error, >5000 is likely the participant getting distracted). We exclude RTs from words where mistakes occurred or which occurred after a mistake in the same sentence. We only analyzed words where we had values for all predictors, which meant that if the previous word was unknown to a model, the word was excluded because of missing values for a lagged predictor.
+In the Maze task, the first word of every sentence is paired with a nonce (x-x-x) distractor rather than a real word (as there is no context to use to distinguish between real words); due to this difference, we excluded the first word of every sentence, leaving `r nrow(not_first)` words. We excluded words for which we did not have surprisal or frequency information, leaving `r nrow(not_na)` words. We additionally excluded words that any model treated as being composed of multiple tokens (primarily words with punctuation), leaving `r nrow(not_gpt)` words.[^1] We excluded outlier RTs that were <100 or >5000 ms (<100 is likely a recording error, >5000 is likely the participant getting distracted). We also excluded RTs from words where mistakes occurred or which occurred after a mistake in the same sentence. We only analyzed words where we had values for all predictors, which meant that if the previous word was unknown to a model, the word was excluded because of missing values for a lagged predictor.
-[^1]:Surprisals should be additive, but summing the surprisals for multi-token words gave some unreasonable responses. For instance, in one story the word king!\' has a surprisal of 64 under GRNN (context: The other birds gave out one by one and when the eagle saw this he thought, \'What is the use of flying any higher? This victory is in the bag and I am king!\'). While GPT-2 using byte-pair encoding that can split up words into multiple parts, excluding words it split up only excluded 30 words that were not already excluded by other models.
+[^1]: Surprisals should be additive, but summing the surprisals for multi-token words gave some unreasonable values. For instance, in one story the word "king!\'" has a surprisal of 64 under GRNN (context: The other birds gave out one by one and when the eagle saw this he thought, \'What is the use of flying any higher? This victory is in the bag and I am king!\'). While GPT-2 uses byte-pair encoding that can split words into multiple tokens, excluding the words it split up removed only 30 words that were not already excluded by other models.
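+For readers adapting this pipeline, the exclusions above reduce to a chain of filters over a per-word RT table; the sketch below uses illustrative column names rather than the actual variables in our analysis code:
+
+```r
+library(dplyr)
+
+rts_analyzed <- rt_data %>%                        # hypothetical per-word RT table
+  filter(word_position > 1) %>%                    # drop sentence-initial x-x-x words
+  filter(!is.na(surprisal), !is.na(frequency)) %>% # require predictor values
+  filter(n_tokens == 1) %>%                        # drop words any model split into multiple tokens
+  filter(rt > 100, rt < 5000) %>%                  # outlier RT cutoffs (ms)
+  filter(!after_mistake)                           # keep only pre-error responses
+```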
### Model specification
To infer the shape of the relationship between our predictor variables and RTs, we fit generalized additive models (GAMs) using `R`'s `mgcv` package to predict the mean RT (after exclusions) for each word, averaging across participants from whom we obtained an unexcluded RT for that word. We centered but did not rescale the length and frequency predictors, and left surprisal uncentered for interpretability. We used smooth terms (`mgcv`'s `s()`) for surprisal and tensor product terms (`mgcv`'s `ti()`) for frequency-by-length effects and interactions. We used restricted maximum likelihood (REML) for smoothing-parameter estimation. To more fully account for the uncertainty in the smoothing parameter estimates, we fit 101 bootstrap replicates of each GAM model; in Figures \@ref(fig:gam) and \@ref(fig:spr-gam), the best-fit lines derive from the mean estimated effect size across the bootstrap replicates, and the shaded areas indicate a 95\% bootstrap confidence interval on this effect size (the boundaries are the 2.5\% and 97.5\% quantiles of the bootstrapped replicates).
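+A minimal `mgcv` sketch of one such model follows; the variable names are illustrative, and the full analysis additionally includes previous-word terms and the bootstrap loop over replicates:
+
+```r
+library(mgcv)
+
+# One GAM per language model: smooth term for surprisal, tensor interaction for
+# frequency-by-length, parametric (centered) main effects, REML smoothing.
+gam_fit <- gam(
+  mean_rt ~ s(surprisal) + ti(frequency_c, length_c) + frequency_c + length_c,
+  data   = word_means,  # hypothetical by-word averaged RTs
+  method = "REML"
+)
+summary(gam_fit)
+```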
@@ -343,7 +343,7 @@ For model comparisons, we took by-item averaged data to aid in fast model fittin
## Do participants engage successfully?
-(ref:error-cap) A. Participant's accuracy on the Maze task (fraction of words selected correctly) versus their average reaction time (in ms). Many participants (marked in green) chose the correct word >80% of the time; others (in red) appear to be randomly guessing. B. Performance on the comprehension questions. Participants with low accuracy performed poorly on comprehension questions; Participants with >80% task accuracy tended to do well; their performance was roughly comparable to the performance of SPR participants from @futrellNaturalStoriesCorpus2020 on their first stories.
+(ref:error-cap) A. Each participant's accuracy on the Maze task (fraction of words selected correctly) versus their average reaction time (in ms). Many participants (marked in green) chose the correct word >80% of the time; others (in red) appear to be randomly guessing. B. Performance on the comprehension questions. Participants with low accuracy performed poorly on comprehension questions; participants with >80% task accuracy tended to do well, and their performance was roughly comparable to the performance of SPR participants from @futrellNaturalStoriesCorpus2020 on their first stories.
```{r errors, out.width="\\textwidth", fig.width=8, fig.height=3, fig.pos="ht", fig.cap="(ref:error-cap)"}
error_plot <- ggplot(data_error_summ, aes(x=pct_correct, y=mean_rt, color=accurate))+
@@ -383,7 +383,7 @@ Accuracy, or how often a participant chose the correct word over the distractor,
[^2]: To avoid biasing the average if a participant took a pause before returning to the task, RTs greater than 5 seconds were excluded. This exclusion removed `r nrow(data_long)` words, or `r round(nrow(data_long)/nrow(data_no_prac)*100, digits=2)`% of trials.
-Another cluster of participants (in red) sped through the task, seemingly clicking randomly. This bimodal distribution is likely due to the mix of workers on Mechanical Turk, as we did not use qualification cutoffs. We believe the high level of random guessing is an artifact of the subject population [@hauser2018], and we expect that following current recommendations for participant recruitment, such as using qualification cutoffs or another recruitment site would result in fewer participants answering randomly [@eyal2021;@peer2017].
+Another cluster of participants (in red) sped through the task, seemingly clicking randomly. This bimodal distribution is likely due to the mix of workers on Mechanical Turk, as we did not use qualification cutoffs. We believe the high level of random guessing is an artifact of the subject population [@hauser2018], and we expect that following current recommendations for participant recruitment, such as using qualification cutoffs or another recruitment site, would result in fewer participants answering randomly [@eyal2021;@peer2017].
@@ -483,7 +483,7 @@ both <- maze %>%
inner_join(spr_collapse)
ggplot(both, aes(x=maze, y=spr)) +
geom_point(alpha=.5, size=1) +
- labs(x="A-Maze", y="SPR") +
+ labs(x="A-maze", y="SPR") +
coord_fixed(ratio=1) +
geom_smooth(method="lm")
@@ -576,11 +576,11 @@ data_anything_goes <- data_filt %>%
-To assess the shape of the RT-surprisal relationship, we then fit generalized additive models (GAMs).[^14] For these models, we only included data that occurred before any mistakes in the sentence; due to limits of model vocabulary, words with punctuation and some uncommon or proper nouns were excluded. We used surprisals generated by 4 different language models for robustness. (See Methods for details on language models, exclusions, and model fit.)
+To assess the shape of the RT-surprisal relationship, we then fit generalized additive models (GAMs).[^14] For these models, we only included data that occurred before any mistakes in the sentence; due to limits of model vocabulary, words with punctuation and some uncommon or proper nouns were excluded. We used surprisals generated by 4 different language models for robustness. (See methods for details on language models, exclusions, and model fit.)
[^14]: Due to previous reports of a length--frequency interaction in RT measures [@kliegl-etal:2006], before pursuing our primary question of the functional form of the surprisal--RT relationship, as an exploratory measure we fit generalized additive models (GAMs) with not only the main effects but also the two-way interactions between surprisal, length, and frequency, for the current word and for the previous word. This analysis revealed significant effects of current-word and previous-word surprisal, current-word and previous-word length, and significant interactions of current-word frequency by length and frequency by surprisal. The other main effects and interactions did not reach statistical significance. (These are results from `mgcv`'s `summary()`; the $p$-values are approximate.) Appendix C provides tables and plots of these effects and interactions for GPT-2. The interactions can be summarized as follows: long, low-frequency words and surprising, high-frequency words have especially long RTs, while surprising, low-frequency words have shorter RTs than would otherwise be predicted. However, these effects are small in terms of variance explained compared to the current-word surprisal effect, which is by far the largest single effect in the model. For simplicity we therefore set aside the interaction terms involving surprisal for the remainder of this analysis.
-(ref:gam-cap) GAM results for the effect of current word surprisal (top) or previous word surprisal (bottom) on Maze reaction time (RT). Density of data is shown along the x-axis. The best-fit lines is from the mean estimated effect size across the bootstrap replicates, and the shaded areas indicate a 95\% bootstrap confidence interval on this effect size. For each of the 4 language models used, there is a linear relationship between current word surprisal and RT. The relationship between previous word surprisal and RT is much flatter.
+(ref:gam-cap) GAM results for the effect of current word surprisal (top) or previous word surprisal (bottom) on Maze reaction time (RT). Density of data is shown along the x-axis. The best-fit lines are from the mean estimated effect size across the bootstrap replicates, and the shaded areas indicate a 95\% bootstrap confidence interval on this effect size. For each of the 4 language models used, there is a linear relationship between current word surprisal and RT. The relationship between previous word surprisal and RT is much flatter.
```{r gam, out.width="\\textwidth", fig.width=8, fig.height=3, fig.pos="ht", fig.cap="(ref:gam-cap)"}
@@ -621,7 +621,7 @@ plot_grid(p2[[1]], p2[[2]], nrow=2, rel_heights = c(1, .3))
The main effects of current and previous word surprisals on RT are shown in Figure \@ref(fig:gam). Note that for each of the models, high-surprisal words are rare, with much of the data from words with between 0 and 15 bits of surprisal. All 4 models show a roughly linear relationship between current word surprisal and RT, especially in the region with more data. To determine the goodness of fit of a model in which word probability effects on RT are taken to be linear in surprisal, we also fit GAM models with both parametric linear and nonparametric non-linear terms for surprisal; for all but the 5-gram model, these analyses supported a linear effect of surprisal (Appendix D).
-(ref:spr-gam-cap) GAM results for the effect of current word surprisal (top) or the surprisal of an earlier word, up to 3 words back on SPR RT data [@futrellNaturalStoriesCorpus2020]. Density of data is shown along the x-axis. The best-fit lines is from the mean estimated effect size across the bootstrap replicates, and the shaded areas indicate a 95\% bootstrap confidence interval on this effect size.
+(ref:spr-gam-cap) GAM results for the effect of current word surprisal (top) or the surprisal of an earlier word, up to 3 words back, on SPR RT data [@futrellNaturalStoriesCorpus2020]. Density of data is shown along the x-axis. The best-fit lines are from the mean estimated effect size across the bootstrap replicates, and the shaded areas indicate a 95\% bootstrap confidence interval on this effect size.
```{r spr-gam, out.width="\\textwidth", fig.width=8, fig.height=4, fig.pos="ht", fig.cap="(ref:spr-gam-cap)"}
gam1 <- read_rds(here("Analysis/models/bootstrapped_spr_GAM_surprisal_predictions.rds")) %>%
@@ -665,7 +665,7 @@ Comparing Maze and SPR, we see that both show a linear relationship, but Maze ha
One of the main claimed advantages of the Maze task is that it has better localization and less spillover than SPR. We examined how much spillover A-maze and SPR each had by fitting linear models with predictors from current and previous words. Large effects from previous words are evidence for spillover; effects of the current word dwarfing any lagged effects would be evidence for localization.
-We modeled reading time as a function of surprisal, frequency, and length as well as surprisal$\times$length and frequency$\times$length interactions. For all of these, we included the predictors for the current and previous word, and we centered, but did not rescale, all predictors. (See Methods for more details on these predictors and model fit process.) As with the GAM models, we used surprisal calculations from 4 different language models for robustness.
+We modeled reading time as a function of surprisal, frequency, and length as well as surprisal$\times$length and frequency$\times$length interactions. For all of these, we included the predictors for the current and previous word, and we centered, but did not rescale, all predictors. (See methods for more details on these predictors and model fit process.) As with the GAM models, we used surprisal calculations from 4 different language models for robustness.
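+As a rough illustration, one such model could be written in `brms` as below; the variable names and random-effects structure are placeholders rather than our exact specification:
+
+```r
+library(brms)
+
+# Current- and previous-word predictors (all centered), plus the
+# surprisal-by-length and frequency-by-length interactions described above.
+maze_model <- brm(
+  rt ~ surp + freq + len + surp:len + freq:len +
+       surp_prev + freq_prev + len_prev +
+       surp_prev:len_prev + freq_prev:len_prev +
+       (1 | subject) + (1 | word),
+  data = maze_rts
+)
+```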
(ref:coeffs) Point estimates and 95% credible intervals for coefficients predicted by fitted Bayesian regression models predicting A-maze RT. Units are in ms. Surprisal is per bit, length per character, and frequency per $log_2$ occurrence per billion words.
@@ -701,12 +701,12 @@ ggthemes::scale_color_solarized(accent="violet")+
```
-The Maze linear model effects are shown in Figure \@ref(fig:coeffs-maze) (See also Appendix B for a table of effects). Across all models, there were consistent large effects of length and surprisal at the current word, but minimal effects of frequency. This lack of frequency effects differs from the results usually reported for SPR and eye-tracking (though see @shainLargescaleStudyEffects2019). There was a small interaction between surprisal and length at the current word.
+The Maze linear model effects are shown in Figure \@ref(fig:coeffs-maze) (see also Appendix B for a table of effects). Across all models, there were consistent large effects of length and surprisal at the current word, but minimal effects of frequency. This lack of frequency effects differs from the results usually reported for SPR and eye-tracking [though see @shainLargescaleStudyEffects2019]. There was a small interaction between surprisal and length at the current word.
Crucially, the effects of previous word predictors are close to zero, and much smaller than the effects of surprisal and length of the current word, an indication that spillover is limited and effects are strongly localized.
-(ref:coeffs-spr) Point estimates and 95% confidence intervals (+/- 1.97 standard error) for coefficients predicted by fitted regression models predicting SPR RT. Units are in ms. Surprisal is per bit, length per character, and frequency per $log_2$ occurrence per billion words.
+(ref:coeffs-spr) Point estimates and 95% confidence intervals (+/-1.97 standard error) for coefficients predicted by fitted regression models predicting SPR RT. Units are in ms. Surprisal is per bit, length per character, and frequency per $log_2$ occurrence per billion words.
```{r coeffs-spr, out.width="\\textwidth", fig.width=8, fig.height=3, fig.pos="ht", fig.cap="(ref:coeffs-spr)"}
test_plot %>% filter(type=="SPR") %>% mutate(time=factor(time, levels=c("Current","Previous","2Previous","3Previous")))%>% ggplot(aes(y=Term, x=Estimate, group=model, color=model, shape=model))+
@@ -720,25 +720,26 @@ ggthemes::scale_color_solarized(accent="violet")+
```
-We ran similar models for SPR, although to account for known spillover effects, we consider predictors from the current and 3 previous words. Due to issues fitting models, the details of the models differed (see Methods). The SPR coefficients are shown in Figure \@ref(fig:coeffs-spr) (see also Appendix B for a table of coefficients). Surprisal, length, and frequency effects are all evident for the current word and surprisal and frequency show effects from the previous word as well. Unlike for Maze, with SPR there is not a clear diminishing of the size of the effects as one goes from current word to prior word predictors.
+We ran similar models for SPR, although to account for known spillover effects, we considered predictors from the current and three previous words. Due to issues fitting models, the details of the models differed (see methods). The SPR coefficients are shown in Figure \@ref(fig:coeffs-spr) (see also Appendix B for a table of coefficients). Surprisal, length, and frequency effects are all evident for the current word, and surprisal and frequency show effects from the previous word as well. Unlike for Maze, with SPR there is not a clear diminishing of the size of the effects as one goes from current-word to prior-word predictors.
-Whereas Maze showed surprisal effects in the 10 to 25 ms/bit range and length effects in the 15 to 20 ms/character range, SPR effects are about 1-2 ms per bit or character. This difference in effect size is disproportionate to the overall speed of the methods; the predicted intercept for the Maze task was roughly 880 ms and for SPR was roughly 360 ms. Thus Maze is 2--3 times as slow as SPR but has roughly 10 times larger effects.
+Whereas Maze showed surprisal effects in the 10 to 25 ms/bit range and length effects in the 15 to 20 ms/character range, SPR effects are about 1 to 2 ms per bit or character. This difference in effect size is disproportionate to the overall speed of the methods; the predicted intercept for the Maze task was roughly 880 ms and for SPR was roughly 360 ms. Thus Maze is 2--3 times as slow as SPR but has roughly 10 times larger effects.
## Which language model fits best?
-Our last analysis question is whether some of the language models fit the human RT data better than others. We assessed each model's fit to A-maze data using log likelihood and R-squared. Then we did a nested model comparison, looking at whether a model with multiple surprisal predictors (ex, GRNN and GPT-2) had a better fit than a model with only one (ex GRNN alone).
+Our last analysis question is whether some of the language models fit the human RT data better than others. We assessed each model's fit to A-maze data using log likelihood and R-squared. Then we did a nested model comparison, looking at whether a model with multiple surprisal predictors (e.g., GRNN and GPT-2) had a better fit than a model with only one (e.g., GRNN alone).
As shown in Table \@ref(tab:maze-compare), GPT-2 provides substantial additional predictive value over each other model; GRNN provides substantial value over 5-gram and Transformer-XL, and a little complementary information over GPT-2; Transformer-XL provides substantial value over 5-gram; and 5-gram provides little over any model. The single-model log likelihoods confirm this hierarchy: GPT-2 is better than GRNN, which is better than Transformer-XL, which is better than 5-gram.
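+Concretely, each nested comparison has the following form (a sketch with illustrative names; the full models also include length and frequency predictors for the current and previous word):
+
+```r
+# Single-surprisal-source model vs. a model adding a second surprisal source.
+m_grnn <- lm(rt ~ surp_grnn + surp_grnn_prev + freq + len, data = word_means)
+m_both <- lm(rt ~ surp_grnn + surp_grnn_prev +
+                  surp_gpt2 + surp_gpt2_prev + freq + len, data = word_means)
+
+anova(m_grnn, m_both)     # F test: does GPT-2 surprisal add predictive value?
+logLik(m_grnn)            # log likelihood of the single-source model
+summary(m_grnn)$r.squared # multiple R-squared of the single-source model
+```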
```{r maze-compare}
-read_rds(here("Analysis/models/maze_model_compare.rds")) %>% knitr::kable(format="latex", position="ht",caption="Results of model comparisons on Maze data. Each row shows the additional predictive value gained from adding that model to another model. F values and p values from ANOVA tests between 1-surprisal-source and 2-source models are reported. We also report log likelihoods of models with only one surprisal source and the r-squared correlation between the model's predictions and the data.")
+read_rds(here("Analysis/models/maze_model_compare.rds")) |> mutate(`Log Lik`=str_replace(`Log Lik`,"-","--")) |>
+knitr::kable(format="latex", position="ht",caption="Results of model comparisons on Maze data. Each row shows the additional predictive value gained from adding that model to another model. F values and p values from ANOVA tests between 1-surprisal-source and 2-source models are reported. We also report log likelihoods of models with only one surprisal source and the r-squared correlation between the model's predictions and the data.")
```
-We followed the same process for the SPR data with results shown in Table \@ref(tab:spr-compare). For SPR, GPT-2 and 5-gram models contain some value over each other model, which is less clear for Transformer-XL and GRNN. In terms of log likelihoods, we find that GPT-2 is better than 5-gram is better than GRNN is better than Transformer-XL, although differences are small. The relatively good fit of 5-gram models to SPR data compared with neural models matches results from @huSystematicAssessmentSyntactic2020 and @wilcoxPredictivePowerNeural2020, and contrasts with the Maze results, where the 5-gram model had the worst fit and did not provide additional predictive value over the other models. While the nature of the generalizations made by these neural network-based models are not fully understood, controlled tests have suggested that their next-word predictions often reflect deeper features of linguistic structure [@warstadt-etal:2020-BLiMP; @huSystematicAssessmentSyntactic2020], such as subject--verb agreement [@marvin-linzen:2018-targeted] and wh-dependencies [@wilcox-etal:2022-using-computational-models], and are sensitive over longer context windows, than n-gram models. The fact that the neural language models dominate the 5-gram models for Maze but not SPR thus suggests that Maze RTs may be more sensitive than SPR RTs to richer language structure-related processes during real-time comprehension.
+We followed the same process for the SPR data with results shown in Table \@ref(tab:spr-compare). For SPR, the GPT-2 and 5-gram models each add some predictive value over each other model, whereas this is less clear for Transformer-XL and GRNN. In terms of log likelihoods, we find that GPT-2 is better than 5-gram, which is better than GRNN, which is better than Transformer-XL, although the differences are small. The relatively good fit of 5-gram models to SPR data compared with neural models matches results from @huSystematicAssessmentSyntactic2020 and @wilcoxPredictivePowerNeural2020, and contrasts with the Maze results, where the 5-gram model had the worst fit and did not provide additional predictive value over the other models. While the nature of the generalizations made by these neural network-based models is not fully understood, controlled tests have suggested that their next-word predictions often reflect deeper features of linguistic structure [@warstadt-etal:2020-BLiMP; @huSystematicAssessmentSyntactic2020], such as subject--verb agreement [@marvin-linzen:2018-targeted] and wh-dependencies [@wilcox-etal:2022-using-computational-models], and are sensitive over longer context windows than n-gram models. The fact that the neural language models dominate the 5-gram models for Maze but not SPR thus suggests that Maze RTs may be more sensitive than SPR RTs to richer language structure-related processes during real-time comprehension.
```{r}
maze <- read_rds(here("Analysis/models/maze_model_compare.rds"))
@@ -750,28 +751,28 @@ As an overall measure of fit to data, we calculate multiple R-squared for the si
```{r spr-compare}
-read_rds(here("Analysis/models/spr_model_compare.rds")) %>% knitr::kable(format="latex", position="ht",caption="Results of model comparisons on SPR data. Each row shows the additional predictive value gained from adding that model to another model. F values and p values from ANOVA tests between 1-surprisal-source and 2-source models are reported. We also report log likelihoods of models with only one surprisal source and the r-squared correlation between the model's predictions and the data.")
+read_rds(here("Analysis/models/spr_model_compare.rds")) |> mutate(`Log Lik`=str_replace(`Log Lik`,"-","--")) |> knitr::kable(format="latex", position="ht",caption="Results of model comparisons on SPR data. Each row shows the additional predictive value gained from adding that model to another model. F values and p values from ANOVA tests between 1-surprisal-source and 2-source models are reported. We also report log likelihoods of models with only one surprisal source and the r-squared correlation between the model's predictions and the data.")
```
# Discussion
-We introduced error-correction Maze, a tweak on the presentation of Maze materials that makes Maze feasible for multi-sentence passages. We then used A-maze distractors and the error-correction Maze presentation to gather data on participants reading stories from the Natural Stories corpus in the Maze. As laid out in the Introduction, this current study addressed five main questions.
+We introduced error-correction Maze, a tweak on the presentation of Maze materials that makes Maze feasible for multi-sentence passages. We then used A-maze distractors and the error-correction Maze presentation to gather data on participants reading stories from the Natural Stories corpus in the Maze. As laid out in the introduction, the current study addressed five main questions.
First, we found that participants could read and comprehend the 1000-word stories, despite the slowness and added overhead of reading in the Maze task. This result expands the domain of materials usable with Maze beyond targeted single-sentence items to longer, naturalistic texts with sentence-to-sentence coherence.
-Second, we took advantage of the pre-existing SPR corpus on Natural Stories to compare the RT profiles between Maze and SPR. Maze and SPR pick up on similar features in words, as shown by the high correlations between Maze and SPR RTs on the sentence level. The correlation within Maze is higher than Maze to SPR correlation or SPR-SPR correlations, which is evidence that Maze is less noisy than SPR.
+Second, we took advantage of the pre-existing SPR corpus on Natural Stories to compare the RT profiles between Maze and SPR. Maze and SPR pick up on similar features in words, as shown by the high correlations between Maze and SPR RTs at the sentence level. The correlation within Maze is higher than either the Maze--SPR correlation or the SPR--SPR correlation, which is evidence that Maze is less noisy than SPR.
Third, we addressed whether the A-maze RT for a word showed a linear relationship with that word's surprisal. We found that A-maze RTs are linearly related to surprisal, matching the functional profile found with other incremental processing methods.
-Fourth, we compared the spillover profiles between Maze and SPR. For Maze, we found large effects of the current word's surprisal and length, which dwarfed any spillover effects from previous word predictors. In contrast, for SPR, we found effects of roughly equal sizes from the current and previous words. ^[Furthermore, the typical spillover profile for SPR data may be worse than suggested by the Natural Stories corpus SPR data: for example, @smithEffectWordPredictability2013 found that most of a word's surprisal effect showed up only one to two words downstream.] Overall, Maze is a slower task than SPR, but it also has much larger effects of length and surprisal, perhaps due to requiring more focus, and thus generating less noisy data.
-We do not find frequency effects on the Maze data, but we do on the SPR data. This could be explained if frequency effects are a first rough approximation of in-context predictability, before the fuller context-sensitive surprisal information is available. In this case, faster methods like eye-tracking and SPR would show frequency effects (in addition to surprisal), but slower methods like Maze would not as the additional demands slow down the response, allowing more contextual information to be used. While this is a difference between Maze and other incremental processing methods, we do not consider it a flaw for Maze---indeed, for researchers interested in focusing on context-contingent language processing, it may suggest an advantage for the Maze task. Regardless, these differences highlight the importance of understanding task demands of different incremental processing methods.
+Fourth, we compared the spillover profiles between Maze and SPR. For Maze, we found large effects of the current word's surprisal and length, which dwarfed any spillover effects from previous word predictors. In contrast, for SPR, we found effects of roughly equal sizes from the current and previous words^[Furthermore, the typical spillover profile for SPR data may be worse than suggested by the Natural Stories corpus SPR data: for example, @smithEffectWordPredictability2013 found that most of a word's surprisal effect showed up only one to two words downstream.]. Overall, Maze is a slower task than SPR, but it also has much larger effects of length and surprisal, perhaps due to requiring more focus, and thus generating less noisy data.
+We do not find frequency effects on the Maze data, but we do on the SPR data. This could be explained if frequency effects are a first rough approximation of in-context predictability, before the fuller context-sensitive surprisal information is available. In this case, faster methods like eye-tracking and SPR would show frequency effects (in addition to surprisal), but slower methods like Maze would not, as the additional demands slow down the response, allowing more contextual information to be used. While this is a difference between Maze and other incremental processing methods, we do not consider it a flaw for Maze; indeed, for researchers interested in focusing on context-contingent language processing, it may suggest an advantage for the Maze task. Regardless, these differences highlight the importance of understanding task demands of different incremental processing methods.
Lastly, we examined how different language models fare at predicting human RT data. We found that overall, the models were more predictive of the A-maze data than SPR data; however, the hierarchy of the models' predictive performance also differed between the A-maze and SPR datasets. This difference suggests that how well a language model predicts human RTs may depend on task. Maze RTs were by far best predicted by neural network language models, whereas SPR RTs were predicted nearly as well by 5-gram models. Our understanding of the linguistic generalization capabilities and performance of these neural network models is still limited, and there are cases where they are known to make more superficial, non-human-like generalizations [@mccoy-etal-2019-right; @chaves-2020-dont], but controlled tests in the NLP literature that analyze their behavior on classic psycholinguistics paradigms [@linzen-etal:2016tacl; @futrell-etal:2019-neural-language-models; @warstadt-etal:2020-BLiMP; @wilcox-etal:2022-using-computational-models] suggest more human-like performance than n-gram models are capable of. These findings further add to the evidence that the Maze task is favorable for RT-based investigations of underlying linguistic processing in the human mind. More broadly, further comparisons between different processing methods on the same materials could be useful for a deeper understanding of how task demands influence language processing [e.g., @bartekSearchOnlineLocality2011].
-Overall, A-maze has excellent localization, although some models showed small but statistically significant effects of the past word. On the whole, our results support the idea that Maze forces language processing to be close to word-by-word, and thus the Maze task can be used under the assumption that the RT of a word primarily reflects its own properties and not those of earlier words. Correlation analysis between Maze and SPR suggest that Maze is picking up on many of the same patterns as does SPR, but with less noise.
+Overall, A-maze has excellent localization, although some models showed small but statistically significant effects of the past word. On the whole, our results support the idea that Maze forces language processing to be close to word-by-word, and thus the Maze task can be used under the assumption that the RT of a word primarily reflects its own properties and not those of earlier words. Correlation analysis between Maze and SPR suggests that Maze is picking up on many of the same patterns as does SPR, but with less noise.
## Limitations
@@ -779,12 +780,33 @@ While we expect these patterns of results reflect features of the A-maze task, t
## Future directions
-Compared to traditional Maze,in error-correction Maze, participants' incentives to finish quickly are in less conflict with the experimenter's desire that participants do the task as intended. However, even with error-correction Maze, clicking randomly is still likely faster than doing the task. In discussing this work, we received the suggestion that one way to further disincentivize random clicking would be to add a pause when a participant makes a mistake, forcing them to wait some short period of time (ex 500ms) before correcting their mistake. This delay would make randomly hitting buttons slower than doing the task as intended, and we have made delaying after wrong presses an option in the error-correction Maze implementation at https://github.com/vboyce/Ibex-with-Maze.
+Compared to traditional Maze, in error-correction Maze, participants' incentives to finish quickly are in less conflict with the experimenter's desire that participants do the task as intended. However, even with error-correction Maze, clicking randomly is still likely faster than doing the task. In discussing this work, we received the suggestion that one way to further disincentivize random clicking would be to add a pause when a participant makes a mistake, forcing them to wait some short period of time, such as 500 ms, before correcting their mistake. This delay would make randomly hitting buttons slower than doing the task as intended, and we have made delaying after wrong presses an option in the error-correction Maze implementation at https://github.com/vboyce/Ibex-with-Maze.
Error-correction Maze records RTs for words after a participant makes a mistake in the sentence. In our analyses, we excluded these post-error data, but we believe it is an open question whether data from after a participant makes a mistake is usable. That is, does it show the same profile as RTs from pre-error words, or are there traces from recovering from the mistake? If there are, how long do these effects take to fade? Whether post-mistake data is high-quality and trustworthy enough to be included in analyses is hard to assess; if it can be used, it would make the Maze task more data efficient.
The Maze task is versatile and can be used or adapted for a wide range of materials and questions of interest. Its forced incrementality makes the Maze task a good target for any question that requires precisely determining the locus of incremental processing difficulty. We encourage researchers to use Maze as an incremental processing method, alone or in comparison with other methods, and we suggest that the error-correction mode be the default choice for presenting Maze materials.
+# Data Accessibility {.unnumbered}
+Data and materials are available at https://github.com/vboyce/natural-stories-maze.
+# Ethics and consent {#ethics-and-consent-optional .unnumbered}
+
+This research was approved by MIT’s Committee on the Use of Humans as Experimental Subjects and run under protocol number 1605559077.
+
+# Funding information {#funding-information-optional .unnumbered}
+
+RPL acknowledges support from NSF grant BCS-2121074, NIH grant U01-NS121471, and the MIT–IBM Artificial Intelligence Research Lab.
+
+# Acknowledgements {#acknowledgements-optional .unnumbered}
+
+We thank the AMLAP 2020 audience, the Computational Psycholinguistics Lab at MIT, the Language and Cognition Lab at Stanford, the QuantLang Lab at UC Irvine, and Mike Frank for feedback on this work.
+
+# Competing interests {#competing-interests-mandatory .unnumbered}
+
+The authors have no competing interests to declare.
+
+# Authors' contributions {#contrib .unnumbered}
+VB contributed Conceptualization, Formal Analysis, Investigation, Methodology, Software, and Writing - Original Draft Preparation. RPL contributed Conceptualization, Formal Analysis, Funding Acquisition, Methodology, Supervision, and Writing - Review & Editing.
+
\newpage
@@ -798,7 +820,9 @@ The Maze task is versatile and can be used or adapted for a wide range of materi
\endgroup
-# Appendix A
+\newpage
+
+# Appendix A {.unnumbered}
The beginning of one of the stories. This excerpt is the first 200 words of a 1000-word story.
Tulip mania was a period in the Dutch Golden Age during which contract prices for bulbs of the recently introduced tulip reached extraordinarily high levels and then suddenly collapsed. At the peak of tulip mania in February sixteen thirty-seven, tulip contracts sold for more than ten times the annual income of a skilled craftsman. It is generally considered the first recorded economic bubble. The tulip, introduced to Europe in the mid sixteenth century from the Ottoman Empire, became very popular in the United Provinces, which we now know as the Netherlands. Tulip cultivation in the United Provinces is generally thought to have started in earnest around fifteen ninety-three, after the Flemish botanist Charles de l'Ecluse had taken up a post at the University of Leiden and established a botanical garden, which is famous as one of the oldest in the world. There, he planted his collection of tulip bulbs that the Emperor's ambassador sent to him from Turkey, which were able to tolerate the harsher conditions of the northern climate. It was shortly thereafter that the tulips began to grow in popularity. The flower rapidly became a coveted luxury item and a status symbol, and a profusion of varieties followed.
@@ -809,7 +833,7 @@ When did tulip mania reach its peak? 1630's, 1730's
From which country did tulips come to Europe? Turkey, Egypt
-# Appendix B
+# Appendix B {.unnumbered}
Full numerical results from the fitted regression models are shown in Table \@ref(tab:pre-error) for A-maze and in Table \@ref(tab:spr-table) for SPR.
@@ -880,7 +904,7 @@ knitr::kable(summ_spr, format="latex", position="!h",caption="Predictions from f
```
-# Appendix C
+# Appendix C {.unnumbered}
We use `mgcv`'s `ti()` tensor interaction terms to test all main effects and two-way interactions among frequency, length, and surprisal for the current word and for the previous word. These effects are visualized in Figure \@ref(fig:gam-grid), and `mgcv`'s approximate significance levels are given in Table \@ref(tab:gam-interact-table). Based on these approximate significance levels, the main effects of current and previous word surprisal and length are significant, as are the current-word frequency-by-length and frequency-by-surprisal interactions; other terms are not statistically significant. These significant interactions can be summarized as especially long, infrequent words being especially slow to select; especially frequent and surprising words being especially slow to select; and especially infrequent and surprising words being less slow to select than a main-effects-only model would predict. The data driving these interactions are in the sparse tails of the word length and surprisal distributions, and as the $F$ statistics in Table \@ref(tab:gam-interact-table) show, their variance explained is small relative to the large effect of current-word surprisal, so in the main-text analysis we set these interactions aside.
@@ -905,7 +929,7 @@ knitr::include_graphics(here("Analysis/models/gam_grid.png"))
-# Appendix D
+# Appendix D {.unnumbered}
The `mgcv` package's implementation of Generalized Additive Models [@wood:2017GAMs] allows linear and nonparametric spline effects of the same continuous predictor to be entered simultaneously into a model. Doing so assigns only the nonlinear part of the effect to the spline term, allowing separate approximate statistical tests of the linear and non-linear components of the effect. We thus test whether the effect of surprisal on A-maze RTs is best described as linear or includes a non-linear component, using the `mgcv` formula:
@@ -936,22 +960,3 @@ knitr::kable(gam_linear, format="latex", position="ht",caption="Comparison of si
```
-# Ethics and consent {#ethics-and-consent-optional .unnumbered}
-
-This research was approved by MIT’s Committee on the Use of Humans as Experimental Subjects and run under protocol number 1605559077.
-
-# Funding information {#funding-information-optional .unnumbered}
-
-RPL acknowledges support from NSF grant BCS-2121074, NIH grant U01-NS121471, and the MIT–IBM Artificial Intelligence Research Lab.
-
-# Acknowledgements {#acknowledgements-optional .unnumbered}
-
-We thank the AMLAP 2020 audience, the Computational Psycholinguistics Lab at MIT, the Language and Cognition Lab at Stanford, the QuantLang Lab at UC Irvine, and Mike Frank for feedback on this work.
-
-# Competing interests {#competing-interests-mandatory .unnumbered}
-
-The authors have no competing interests to declare.
-
-# Authors' contributions {#contrib .unnumbered}
-VB contributed Conceptualization, Formal Analysis, Investigation, Methodology, Software, and Writing - Original Draft Preparation. RPL contributed Conceptualization, Formal Analysis, Funding Acquisition, Methodology, Supervision, and Writing - Review & Editing.
-
diff --git a/Papers/Paper/manuscript.pdf b/Papers/Paper/manuscript.pdf
index cc9c535..2635799 100644
Binary files a/Papers/Paper/manuscript.pdf and b/Papers/Paper/manuscript.pdf differ
diff --git a/Papers/Paper/r-references.bib b/Papers/Paper/r-references.bib
index f10cfba..6b5348c 100644
--- a/Papers/Paper/r-references.bib
+++ b/Papers/Paper/r-references.bib
@@ -2,7 +2,6 @@ @Manual{R-base
title = {R: A Language and Environment for Statistical Computing},
author = {{R Core Team}},
organization = {R Foundation for Statistical Computing},
- address = {Vienna, Austria},
year = {2022},
url = {https://www.R-project.org/},
}
@@ -10,7 +9,6 @@ @Book{R-bookdown
title = {bookdown: Authoring Books and Technical Documents with {R} Markdown},
author = {Yihui Xie},
publisher = {Chapman and Hall/CRC},
- address = {Boca Raton, Florida},
year = {2016},
note = {ISBN 978-1138700109},
url = {https://bookdown.org/yihui/bookdown},
diff --git a/Papers/Paper/refs.bib b/Papers/Paper/refs.bib
index eee187a..50ba426 100644
--- a/Papers/Paper/refs.bib
+++ b/Papers/Paper/refs.bib
@@ -60,7 +60,7 @@ @article{batesFittingLinearMixedeffects2015
@article{boyceMazeMadeEasy2020,
ids = {boyceMazeMadeEasy2020a},
- title = {Maze {{Made Easy}}: {{Better}} and Easier Measurement of Incremental Processing Difficulty},
+ title = {Maze Made Easy: {{Better}} and Easier Measurement of Incremental Processing Difficulty},
shorttitle = {Maze {{Made Easy}}},
author = {Boyce, Veronica and Futrell, Richard and Levy, Roger P.},
year = {2020},
@@ -87,7 +87,7 @@ @article{burknerAdvancedBayesianMultilevel2018
}
@article{chmielewski2020,
- title = {An {{MTurk Crisis}}? {{Shifts}} in {{Data Quality}} and the {{Impact}} on {{Study Results}}},
+ title = {An {{MTurk}} Crisis? {{Shifts}} in Data Quality and the Impact on Study Results},
shorttitle = {An {{MTurk Crisis}}?},
author = {Chmielewski, Michael and Kucker, Sarah C.},
year = {2020},
@@ -104,7 +104,7 @@ @article{chmielewski2020
}
@article{daiTransformerXLAttentiveLanguage2019,
- title = {Transformer-{{XL}}: {{Attentive Language Models Beyond}} a {{Fixed-Length Context}}},
+ title = {Transformer-{{XL}}: {{Attentive}} Language Models Beyond a Fixed-Length Context},
shorttitle = {Transformer-{{XL}}},
author = {Dai, Zihang and Yang, Zhilin and Yang, Yiming and Carbonell, Jaime and Le, Quoc V. and Salakhutdinov, Ruslan},
year = {2019},
@@ -188,7 +188,7 @@ @article{futrellNaturalStoriesCorpus2020
}
@inproceedings{gauthierSyntaxGymOnlinePlatform2020,
- title = {{{SyntaxGym}}: {{An Online Platform}} for {{Targeted Evaluation}} of {{Language Models}}},
+ title = {{{SyntaxGym}}: {{An}} Online Platform for Targeted Evaluation of Language Models},
shorttitle = {{{SyntaxGym}}},
booktitle = {Proceedings of the 58th {{Annual Meeting}} of the {{Association}} for {{Computational Linguistics}}: {{System Demonstrations}}},
author = {Gauthier, Jon and Hu, Jennifer and Wilcox, Ethan and Qian, Peng and Levy, Roger},
@@ -239,7 +239,7 @@ @inproceedings{gulordava18
@techreport{hauser2018,
type = {Preprint},
- title = {Common {{Concerns}} with {{MTurk}} as a {{Participant Pool}}: {{Evidence}} and {{Solutions}}},
+ title = {Common Concerns with {{MTurk}} as a Participant Pool: Evidence and Solutions},
shorttitle = {Common {{Concerns}} with {{MTurk}} as a {{Participant Pool}}},
author = {Hauser, David and Paolacci, Gabriele and Chandler, Jesse J.},
year = {2018},
@@ -252,7 +252,7 @@ @techreport{hauser2018
}
@article{huSystematicAssessmentSyntactic2020,
- title = {A {{Systematic Assessment}} of {{Syntactic Generalization}} in {{Neural Language Models}}},
+ title = {A Systematic Assessment of Syntactic Generalization in Neural Language Models},
author = {Hu, Jennifer and Gauthier, Jon and Qian, Peng and Wilcox, Ethan and Levy, Roger P.},
year = {2020},
month = may,
@@ -282,7 +282,7 @@ @article{klieglLengthFrequencyPredictability2004
}
@article{koornneefUseVerbbasedImplicit2006,
- title = {On the Use of Verb-Based Implicit Causality in Sentence Comprehension : {{Evidence}} from Self-Paced Reading and Eye Tracking},
+ title = {On the Use of Verb-Based Implicit Causality in Sentence Comprehension: {{Evidence}} from Self-Paced Reading and Eye Tracking},
author = {Koornneef, Arnout W. and {van Berkum}, Jos J.A.},
year = {2006},
journal = {Journal of Memory and Language},
@@ -349,7 +349,7 @@ @article{peer2017
}
@article{radfordLanguageModelsAre,
- title = {Language {{Models}} Are {{Unsupervised Multitask Learners}}},
+ title = {Language Models Are Unsupervised Multitask Learners},
author = {Radford, Alec and Wu, Jeffrey and Child, Rewon and Luan, David and Amodei, Dario and Sutskever, Ilya},
pages = {24},
abstract = {Natural language processing tasks, such as question answering, machine translation, reading comprehension, and summarization, are typically approached with supervised learning on taskspecific datasets. We demonstrate that language models begin to learn these tasks without any explicit supervision when trained on a new dataset of millions of webpages called WebText. When conditioned on a document plus questions, the answers generated by the language model reach 55 F1 on the CoQA dataset - matching or exceeding the performance of 3 out of 4 baseline systems without using the 127,000+ training examples. The capacity of the language model is essential to the success of zero-shot task transfer and increasing it improves performance in a log-linear fashion across tasks. Our largest model, GPT-2, is a 1.5B parameter Transformer that achieves state of the art results on 7 out of 8 tested language modeling datasets in a zero-shot setting but still underfits WebText. Samples from the model reflect these improvements and contain coherent paragraphs of text. These findings suggest a promising path towards building language processing systems which learn to perform tasks from their naturally occurring demonstrations.},
@@ -360,7 +360,7 @@ @article{radfordLanguageModelsAre
@article{raynerEffectsFrequencyPredictability2004,
ids = {rayner-etal:2004},
- title = {The {{Effects}} of {{Frequency}} and {{Predictability}} on {{Eye Fixations}} in {{Reading}}: {{Implications}} for the {{E-Z Reader Model}}.},
+ title = {The Effects of Frequency and Predictability on Eye Fixations in Reading: {{Implications}} for the {{E-Z Reader}} Model.},
shorttitle = {The {{Effects}} of {{Frequency}} and {{Predictability}} on {{Eye Fixations}} in {{Reading}}},
author = {Rayner, Keith and Ashby, Jane and Pollatsek, Alexander and Reichle, Erik D.},
year = {2004},
@@ -387,7 +387,7 @@ @article{raynerEyeMovementsReading1998
}
@inproceedings{shainDeconvolutionalTimeSeries2018,
- title = {Deconvolutional {{Time Series Regression}}: {{A Technique}} for {{Modeling Temporally Diffuse Effects}}},
+ title = {Deconvolutional Time Series Regression: {{A}} Technique for Modeling Temporally Diffuse Effects},
shorttitle = {Deconvolutional {{Time Series Regression}}},
booktitle = {Proceedings of the 2018 {{Conference}} on {{Empirical Methods}} in {{Natural Language Processing}}},
author = {Shain, Cory and Schuler, William},
@@ -431,7 +431,6 @@ @inproceedings{smith-levy:2008
pages = {595–600},
booktitle = {Proceedings of the 30th Annual Meeting of the Cognitive Science Society},
author = {Smith, Nathaniel J. and Levy, Roger},
- address = {Washington, DC}
}
@article{smithEffectWordPredictability2013,
@@ -475,14 +474,13 @@ @article{traxlerProcessingSubjectObject2002
}
@article{vaniUsingInterpolatedMaze2021,
- title = {Using the {{Interpolated Maze Task}} to {{Assess Incremental Processing}} in {{English Relative Clauses}}},
+ title = {Using the Interpolated {{Maze}} Task to Assess Incremental Processing in {{English}} Relative Clauses},
author = {Vani, Pranali and Wilcox, Ethan Gotlieb and Levy, Roger},
year = {2021},
journal = {Proceedings of the Annual Meeting of the Cognitive Science Society},
volume = {43},
number = {43},
issn = {1069-7977},
- abstract = {In English, Subject Relative Clauses are processed more quickly than Object Relative Clauses, but open questions remain about where in the clause slowdown occurs. The surprisal theory of incremental processing, under which processing difficulty corresponds to probabilistic expectations about upcoming material, predicts that slowdown should occur immediately on material that disambiguates the subject from object relative clause. However, evidence from eye tracking and self-paced reading studies suggests that slowdown occurs downstream of RC-disambiguating material, on the relative clause verb. These methods, however, suffer from well-known spillover effects which makes their results difficult to interpret. To address these issues, we introduce and deploy a novel variant of the Maze task for reading times (Forster, Guerrera, \& Elliot, 2009), called the Interpolated Maze in two English web-based experiments. In Experiment 1, we find that the locus of reading-time differences between SRCs and ORCs falls on immediate disambiguating definite determiner. Experiment 2 provides a control, showing that ORCs are read more slowly than lexically-matching, non-anomalous material. These results provide new evidence for the locus of processing difficulty in relative clauses and support the surprisal theory of incremental processing.},
langid = {english},
keywords = {read},
file = {/home/vboyce/Zotero/storage/SN4J2YI6/Vani et al. - 2021 - Using the Interpolated Maze Task to Assess Increme.pdf;/home/vboyce/Zotero/storage/MR57FEX9/3x34x7dz.html}
@@ -490,7 +488,7 @@ @article{vaniUsingInterpolatedMaze2021
@article{wilcoxPredictivePowerNeural2020,
ids = {wilcoxPredictivePowerNeural},
- title = {On the {{Predictive Power}} of {{Neural Language Models}} for {{Human Real-Time Comprehension Behavior}}},
+ title = {On the Predictive Power of Neural Language Models for Human Real-Time Comprehension Behavior},
author = {Wilcox, Ethan and Gauthier, Jon and Hu, Jennifer and Qian, Peng and Levy, Roger},
year = {2020},
month = jun,
@@ -512,7 +510,6 @@ @inproceedings{wilcoxTargetedAssessmentIncremental2021
month = aug,
pages = {939--952},
publisher = {{Association for Computational Linguistics}},
- address = {{Online}},
doi = {10.18653/v1/2021.acl-long.76},
abstract = {We present a targeted, scaled-up comparison of incremental processing in humans and neural language models by collecting by-word reaction time data for sixteen different syntactic test suites across a range of structural phenomena. Human reaction time data comes from a novel online experimental paradigm called the Interpolated Maze task. We compare human reaction times to by-word probabilities for four contemporary language models, with different architectures and trained on a range of data set sizes. We find that across many phenomena, both humans and language models show increased processing difficulty in ungrammatical sentence regions with human and model `accuracy' scores a la Marvin and Linzen (2018) about equal. However, although language model outputs match humans in direction, we show that models systematically under-predict the difference in magnitude of incremental processing difficulty between grammatical and ungrammatical sentences. Specifically, when models encounter syntactic violations they fail to accurately predict the longer reading times observed in the human data. These results call into question whether contemporary language models are approaching human-like performance for sensitivity to syntactic violations.},
keywords = {read},
@@ -600,9 +597,9 @@ @incollection{mitchell:2004online-methods
booktitle = {The on-line study of sentence comprehension: Eye-tracking, ERP and beyond},
date-added = {2017-12-18 21:14:04 +0000},
date-modified = {2017-12-18 21:15:20 +0000},
- editor = {Carreiras, Manuel, and {Clifton Jr.}, Charles},
+ editor = {Manuel Carreiras and Charles {Clifton Jr.}},
pages = {15--32},
- publisher = {London: Routledge},
+ publisher = {Routledge},
title = {On-line methods in language processing: Introduction and historical review},
year = {2004}}
@@ -660,7 +657,7 @@ @incollection{mitchell:1984
date-added = {2009-10-11 15:41:36 -0700},
date-modified = {2009-10-11 15:43:55 -0700},
editor = {D. Kieras and M. A. Just},
- publisher = {Hillsdale, NJ: Earlbaum},
+ publisher = {Erlbaum},
title = {An Evaluation of Subject-Paced Reading Tasks and Other Methods for Investigating Immediate Processes in Reading},
year = {1984}}
@@ -668,7 +665,7 @@ @book{wood:2017GAMs
author = {Wood, Simon},
date-added = {2015-07-14 22:48:32 +0000},
date-modified = {2015-07-15 12:28:18 +0000},
- publisher = {CRC press},
+ publisher = {CRC Press},
title = {Generalized additive models: an introduction with {R}},
edition = {2},
year = {2017}}
@@ -706,7 +703,6 @@ @misc{levinson:2022-beyond-surprising
@inproceedings{hale:2001,
- address = {Pittsburgh, Pennsylvania},
author = {John Hale},
booktitle = naacl2,
date-modified = {2013-04-29 05:57:13 +0000},
@@ -738,7 +734,6 @@ @article{linzen-etal:2016tacl
@inproceedings{marvin-linzen:2018-targeted,
abstract = {We present a data set for evaluating the grammaticality of the predictions of a language model. We automatically construct a large number of minimally different pairs of English sentences, each consisting of a grammatical and an ungrammatical sentence. The sentence pairs represent different variations of structure-sensitive phenomena: subject-verb agreement, reflexive anaphora and negative polarity items. We expect a language model to assign a higher probability to the grammatical sentence than the ungrammatical one. In an experiment using this data set, an LSTM language model performed poorly on many of the constructions. Multi-task training with a syntactic objective (CCG supertagging) improved the LSTM{'}s accuracy, but a large gap remained between its performance and the accuracy of human participants recruited online. This suggests that there is considerable room for improvement over LSTMs in capturing syntax in a language model.},
- address = {Brussels, Belgium},
author = {Marvin, Rebecca and Linzen, Tal},
booktitle = emnlp2018,
date-added = {2019-05-31 17:57:21 -0400},
@@ -848,7 +843,7 @@ @incollection{levy:2013sentenceProcessing
date-modified = {2019-02-14 13:31:07 -0500},
editor = {Roger P. G. {van Gompel}},
pages = {78--114},
- publisher = {Hove: Psychology Press},
+ publisher = {Psychology Press},
rogerslocalurl = {papers/levy-2013-memory-and-surprisal-corrected.pdf},
title = {Memory and Surprisal in Human Sentence Comprehension},
topic = {Sentence processing},
@@ -862,7 +857,6 @@ @inproceedings{mccoy-etal-2019-right
booktitle = "Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics",
month = jul,
year = "2019",
- address = "Florence, Italy",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/P19-1334",
doi = "10.18653/v1/P19-1334",
@@ -876,7 +870,6 @@ @inproceedings{chaves-2020-dont
booktitle = "Proceedings of the Society for Computation in Linguistics 2020",
month = jan,
year = "2020",
- address = "New York, New York",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2020.scil-1.1",
pages = "1--11",