@inproceedings{Wang-sfr-judge,
  author    = {Peifeng Wang and Austin Xu and Yilun Zhou and Caiming Xiong and Shafiq Joty},
  title     = {SFR-Judge: Direct Judgement Preference Optimization},
  booktitle = {Arxiv},
  series    = {cs.CL},
  publisher = {Arxiv},
  address   = {Arxiv},
  year      = {2024},
  url       = {https://arxiv.org/abs/2409.14664},
}
@inproceedings{nguyen-sfr-rag,
  author    = {Xuan-Phi Nguyen and Shrey Pandit and Senthil Purushwalkam and Austin Xu and Hailin Chen and Yifei Ming and Zixuan Ke and Silvio Savarese and Caiming Xiong and Shafiq Joty},
  title     = {SFR-RAG: Towards Contextually Faithful LLMs},
  booktitle = {Arxiv},
  series    = {cs.CL},
  publisher = {Arxiv},
  address   = {Arxiv},
  year      = {2024},
  url       = {https://arxiv.org/abs/2409.09916},
}
@inproceedings{Zhenmei-gem,
  author    = {Zhenmei Shi and Yifei Ming and Xuan-Phi Nguyen and Yingyu Liang and Shafiq Joty},
  title     = {Discovering the Gems in Early Layers: Accelerating Long-Context LLMs with 1000x Input Token Reduction},
  booktitle = {Arxiv},
  series    = {cs.CL},
  publisher = {Arxiv},
  address   = {Arxiv},
  year      = {2024},
  url       = {https://arxiv.org/abs/2409.17422},
}
@inproceedings{Chen-et-al-arxiv-24,
  author    = {Hailin Chen and Fangkai Jiao and Xingxuan Li and Chengwei Qin and Mathieu Ravaut and Ruochen Zhao and Caiming Xiong and Shafiq Joty},
  title     = {{ChatGPT's} One-year Anniversary: Are Open-Source Large Language Models Catching up?},
  booktitle = {Arxiv},
  series    = {cs.CL},
  publisher = {Arxiv},
  address   = {Arxiv},
  year      = {2024},
  url       = {https://arxiv.org/abs/2311.16989},
  abstract  = {Upon its release in late 2022, ChatGPT has brought a seismic shift in the entire landscape of AI, both in research and commerce. Through instruction-tuning a large language model (LLM) with supervised fine-tuning and reinforcement learning from human feedback, it showed that a model could answer human questions and follow instructions on a broad panel of tasks. Following this success, interests in LLMs have intensified, with new LLMs flourishing at frequent interval across academia and industry, including many start-ups focused on LLMs. While closed-source LLMs (e.g., OpenAI's GPT, Anthropic's Claude) generally outperform their open-source counterparts, the progress on the latter has been rapid with claims of achieving parity or even better on certain tasks. This has crucial implications not only on research but also on business. In this work, on the first anniversary of ChatGPT, we provide an exhaustive overview of this success, surveying all tasks where an open-source LLM has claimed to be on par or better than ChatGPT.},
}
@inproceedings{Ravaut-et-al-arxiv-24,
  author    = {Mathieu Ravaut and Bosheng Ding and Fangkai Jiao and Hailin Chen and Xingxuan Li and Ruochen Zhao and Chengwei Qin and Caiming Xiong and Shafiq Joty},
  title     = {How Much are {LLMs} Contaminated? A Comprehensive Survey and the {LLMSanitize} Library},
  booktitle = {Arxiv},
  series    = {cs.CL},
  publisher = {Arxiv},
  address   = {Arxiv},
  year      = {2024},
  url       = {https://arxiv.org/abs/2404.00699},
  abstract  = {With the rise of Large Language Models (LLMs) in recent years, new opportunities are emerging, but also new challenges, and contamination is quickly becoming critical. Business applications and fundraising in AI have reached a scale at which a few percentage points gained on popular question-answering benchmarks could translate into dozens of millions of dollars, placing high pressure on model integrity. At the same time, it is becoming harder and harder to keep track of the data that LLMs have seen; if not impossible with closed-source models like GPT-4 and Claude-3 not divulging any information on the training set. As a result, contamination becomes a critical issue: LLMs' performance may not be reliable anymore, as the high performance may be at least partly due to their previous exposure to the data. This limitation jeopardizes the entire progress in the field of NLP, yet, there remains a lack of methods on how to efficiently address contamination, or a clear consensus on prevention, mitigation and classification of contamination. In this paper, we survey all recent work on contamination with LLMs, and help the community track contamination levels of LLMs by releasing an open-source Python library named LLMSanitize implementing major contamination detection algorithms, which link is: https://github.com/ntunlp/LLMSanitize.},
}
@inproceedings{Li-et-al-arxiv-24,
  author    = {Minzhi Li and Zhengyuan Liu and Shumin Deng and Shafiq Joty and Nancy Chen and Min-Yen Kan},
  title     = {{Decompose and Aggregate}: A Step-by-Step Interpretable Evaluation Framework},
  booktitle = {Arxiv},
  series    = {cs.CL},
  publisher = {Arxiv},
  address   = {Arxiv},
  year      = {2024},
  url       = {https://arxiv.org/abs/2405.15329},
  abstract  = {The acceleration of Large Language Models (LLMs) research has opened up new possibilities for evaluating generated texts. They serve as scalable and economical evaluators, but the question of how reliable these evaluators are has emerged as a crucial research question. Prior research efforts in the meta-evaluation of LLMs as judges limit the prompting of an LLM to a single use to obtain a final evaluation decision. They then compute the agreement between LLMs' outputs and human labels. This lacks interpretability in understanding the evaluation capability of LLMs. In light of this challenge, we propose Decompose and Aggregate, which breaks down the evaluation process into different stages based on pedagogical practices. Our experiments illustrate that it not only provides a more interpretable window for how well LLMs evaluate, but also leads to improvements up to 39.6% for different LLMs on a variety of meta-evaluation benchmarks.},
}
@inproceedings{Xingxuan-et-al-arxiv-24,
  author    = {Xingxuan Li and Xuan-Phi Nguyen and Shafiq Joty and Lidong Bing},
  title     = {{ParaICL}: Towards Robust Parallel In-Context Learning},
  booktitle = {Arxiv},
  series    = {cs.CL},
  publisher = {Arxiv},
  address   = {Arxiv},
  year      = {2024},
  url       = {https://arxiv.org/abs/2404.00570},
  abstract  = {Large language models (LLMs) have become the norm in natural language processing (NLP), excelling in few-shot in-context learning (ICL) with their remarkable abilities. Nonetheless, the success of ICL largely hinges on the choice of few-shot demonstration examples, making the selection process increasingly crucial. Existing methods have delved into optimizing the quantity and semantic similarity of these examples to improve ICL performances. However, our preliminary experiments indicate that the effectiveness of ICL is limited by the length of the input context. Moreover, varying combinations of few-shot demonstration examples can significantly boost accuracy across different test samples. To address this, we propose a novel method named parallel in-context learning (ParaICL) that effectively utilizes all demonstration examples without exceeding the manageable input context length. ParaICL employs parallel batching to distribute demonstration examples into different batches according to the semantic similarities of the questions in the demonstrations to the test question. It then computes normalized batch semantic scores for each batch. A weighted average semantic objective, constrained by adaptive plausibility, is applied to select the most appropriate tokens. Through extensive experiments, we validate the effectiveness of ParaICL and conduct ablation studies to underscore its design rationale. We further demonstrate that ParaICL can seamlessly integrate with existing methods.},
}
@inproceedings{Jiao-emnlp-24,
  author    = {Fangkai Jiao and Chengwei Qin and Zhengyuan Liu and Nancy Chen and Shafiq Joty},
  title     = {Learning Planning-based Reasoning by Trajectories Collection and Process Reward Synthesizing},
  booktitle = {Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing},
  series    = {EMNLP'24},
  publisher = {ACL},
  address   = {Miami, USA},
  year      = {2024},
  url       = {https://arxiv.org/abs/2402.00658},
  abstract  = {Large Language Models (LLMs) have demonstrated significant potential in handling complex reasoning tasks through step-by-step rationale generation. However, recent studies have raised concerns regarding the hallucination and flaws in their reasoning process. Substantial efforts are being made to improve the reliability and faithfulness of the generated rationales. Some approaches model reasoning as planning, while others focus on annotating for process supervision. Nevertheless, the planning-based search process often results in high latency due to the frequent assessment of intermediate reasoning states and the extensive exploration space. Additionally, supervising the reasoning process with human annotation is costly and challenging to scale for LLM training. To address these issues, in this paper, we propose a framework to learn planning-based reasoning through Direct Preference Optimization (DPO) on collected trajectories, which are ranked according to synthesized process rewards. Our results on challenging logical reasoning benchmarks demonstrate the effectiveness of our learning framework, showing that our 7B model can surpass the strong counterparts like GPT-3.5-Turbo.},
}
@inproceedings{Islam-emnlp-24,
  author    = {Mohammed Islam and Md Laskar and Md Parvez and Enamul Hoque and Shafiq Joty},
  title     = {DataNarrative: Automated Data-Driven Storytelling with Visualizations and Texts},
  booktitle = {Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing},
  series    = {EMNLP'24},
  publisher = {ACL},
  address   = {Miami, USA},
  year      = {2024},
  url       = {https://arxiv.org/abs/2408.05346},
  abstract  = {Data-driven storytelling is a powerful method for conveying insights by combining narrative techniques with visualizations and text. These stories integrate visual aids, such as highlighted bars and lines in charts, along with textual annotations explaining insights. However, creating such stories requires a deep understanding of the data and meticulous narrative planning, often necessitating human intervention, which can be time-consuming and mentally taxing. While Large Language Models (LLMs) excel in various NLP tasks, their ability to generate coherent and comprehensive data stories remains underexplored. In this work, we introduce a novel task for data story generation and a benchmark containing 1,449 stories from diverse sources. To address the challenges of crafting coherent data stories, we propose a multi-agent framework employing two LLM agents designed to replicate the human storytelling process: one for understanding and describing the data (Reflection), generating the outline, and narration, and another for verification at each intermediary step. While our agentic framework generally outperforms non-agentic counterparts in both model-based and human evaluations, the results also reveal unique challenges in data story generation.},
}
@inproceedings{Xingxuan-emnlp-24,
  author    = {Xingxuan Li and Yutong Li and Lin Qiu and Shafiq Joty and Lidong Bing},
  title     = {Evaluating Psychological Safety of Large Language Models},
  booktitle = {Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing},
  series    = {EMNLP'24},
  publisher = {ACL},
  address   = {Miami, USA},
  year      = {2024},
  url       = {https://arxiv.org/abs/2212.10529},
  abstract  = {In this work, we designed unbiased prompts to systematically evaluate the psychological safety of large language models (LLMs). First, we tested five different LLMs by using two personality tests: Short Dark Triad (SD-3) and Big Five Inventory (BFI). All models scored higher than the human average on SD-3, suggesting a relatively darker personality pattern. Despite being instruction fine-tuned with safety metrics to reduce toxicity, InstructGPT, GPT-3.5, and GPT-4 still showed dark personality patterns; these models scored higher than self-supervised GPT-3 on the Machiavellianism and narcissism traits on SD-3. Then, we evaluated the LLMs in the GPT series by using well-being tests to study the impact of fine-tuning with more training data. We observed a continuous increase in the well-being scores of GPT models. Following these observations, we showed that fine-tuning Llama-2-chat-7B with responses from BFI using direct preference optimization could effectively reduce the psychological toxicity of the model. Based on the findings, we recommended the application of systematic and comprehensive psychological metrics to further evaluate and improve the safety of LLMs.},
}
@inproceedings{Han-et-al-arxiv-24,
  author    = {Simeng Han and Hailey Schoelkopf and Yilun Zhao and Zhenting Qi and Martin Riddell and Wenfei Zhou and James Coady and David Peng and Yujie Qiao and Luke Benson and Lucy Sun and Alexander Wardle-Solano and Hannah Szabó and Ekaterina Zubova and Matthew Burtell and Jonathan Fan and Yixin Liu and Brian Wong and Malcolm Sailor and Ansong Ni and Linyong Nan and Jungo Kasai and Tao Yu and Rui Zhang and Alexander Fabbri and Wojciech Maciej Kryscinski and Semih Yavuz and Ye Liu and Victoria Lin and Shafiq Joty and Yingbo Zhou and Caiming Xiong and Rex Ying and Arman Cohan and Dragomir Radev},
  title     = {{FOLIO}: Natural Language Reasoning with First-Order Logic},
  booktitle = {Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing},
  series    = {EMNLP'24},
  publisher = {ACL},
  address   = {Miami, USA},
  year      = {2024},
  url       = {https://arxiv.org/abs/2209.00840},
  abstract  = {Large language models (LLMs) have achieved remarkable performance on a variety of natural language understanding tasks. However, existing benchmarks are inadequate in measuring the complex logical reasoning capabilities of a model. We present FOLIO, a human-annotated, logically complex and diverse dataset for reasoning in natural language (NL), equipped with first-order logic (FOL) annotations. FOLIO consists of 1,430 examples (unique conclusions), each paired with one of 487 sets of premises used to deductively reason for the validity of each conclusion. The logical correctness of the premises and conclusions is ensured by their FOL annotations, which are automatically verified by an FOL inference engine. In addition to the main NL reasoning task, NL-FOL pairs in FOLIO constitute a new NL-FOL translation dataset. Our experiments on FOLIO systematically evaluate the FOL reasoning ability of supervised fine-tuning on medium-sized language models. For both NL reasoning and NL-FOL translation, we benchmark multiple state-of-the-art language models. Our results show that a subset of FOLIO presents a challenge for one of the most capable {Large Language Model (LLM)} publicly available, GPT-4.},
}
@inproceedings{Laskar-emnlp-24,
  author    = {Md Laskar and Sawsan Alqahtani and M Bari and Mizanur Rahman and Mohammad Khan and Haidar Khan and Israt Jahan and Amran Bhuiyan and Chee Tan and Md Parvez and Enamul Hoque and Shafiq Joty and Jimmy Huang},
  title     = {A Systematic Survey and Critical Review on Evaluating Large Language Models: Challenges, Limitations, and Recommendations},
  booktitle = {Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing},
  series    = {EMNLP'24},
  publisher = {ACL},
  address   = {Miami, USA},
  year      = {2024},
  url       = {https://arxiv.org/html/2407.04069v1},
  abstract  = {Large Language Models (LLMs) have recently gained significant attention due to their remarkable capabilities in performing diverse tasks across various domains. However, a thorough evaluation of these models is crucial before deploying them in real-world applications to ensure they produce reliable performance. Despite the well-established importance of evaluating LLMs in the community, the complexity of the evaluation process has led to varied evaluation setups, causing inconsistencies in findings and interpretations. To address this, we systematically review the primary challenges and limitations causing these inconsistencies and unreliable evaluations in various steps of LLM evaluation. Based on our critical review, we present our perspectives and recommendations to ensure LLM evaluations are reproducible, reliable, and robust.},
}
@inproceedings{Shayekh-emnlp-24,
  author    = {Shayekh Islam and Md Rahman and K Hossain and Enamul Hoque and Shafiq Joty and Md Parvez},
  title     = {{Open-RAG}: Enhanced Retrieval Augmented Reasoning with Open-Source Large Language Models},
  booktitle = {Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing},
  series    = {EMNLP'24 Findings},
  publisher = {ACL},
  address   = {Miami, USA},
  year      = {2024},
  url       = {https://arxiv.org/abs/2410.01782},
  abstract  = {Retrieval Augmented Generation (RAG) has been shown to enhance the factual accuracy of Large Language Models (LLMs) by providing external evidence, but existing methods often suffer from limited reasoning capabilities (e.g., multi-hop complexities) in effectively using such evidence, particularly when using open-source LLMs. To mitigate this gap, in this paper, we introduce a novel framework, Open-RAG, designed to enhance reasoning capabilities in RAG with open-source LLMs. Our framework transforms an arbitrary dense LLM into a parameter-efficient sparse mixture of experts (MoE) model capable of handling complex reasoning tasks, including both single- and multi-hop queries. Open-RAG uniquely trains the model to navigate challenging distractors that appear relevant but are misleading. By combining the constructive learning and architectural transformation, Open-RAG leverages latent learning, dynamically selecting relevant experts and integrating external knowledge effectively for more accurate and contextually relevant responses. Additionally, we propose a hybrid adaptive retrieval method to determine retrieval necessity and balance the trade-off between performance gain and inference speed. Experimental results show that Open-RAG outperforms state-of-the-art LLMs and RAG models in various knowledge-intensive tasks. Our method based on Llama2-7B sets new benchmarks, surpassing ChatGPT-RAG and Self-RAG. For example, in multi-hop HotpotQA, it achieves an EM score of 63.3, compared to RAG 2.0's 54 and Command R+'s 60.},
}
@inproceedings{Divyansh-emnlp-24,
  author    = {Divyansh Agarwal and Alexander Fabbri and Philippe Laban and Shafiq Joty and Caiming Xiong and Chien-Sheng Wu},
  title     = {Investigating the prompt leakage effect and black-box defenses for multi-turn {LLM} interactions},
  booktitle = {Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing},
  series    = {EMNLP'24 Industry Track},
  publisher = {ACL},
  address   = {Miami, USA},
  year      = {2024},
  url       = {https://arxiv.org/abs/2404.16251},
  abstract  = {Prompt leakage poses a compelling security and privacy threat in LLM applications. Leakage of system prompts may compromise intellectual property, and act as adversarial reconnaissance for an attacker. A systematic evaluation of prompt leakage threats and mitigation strategies is lacking, especially for multi-turn LLM interactions. In this paper, we systematically investigate LLM vulnerabilities against prompt leakage for 10 closed- and open-source LLMs, across four domains. We design a unique threat model which leverages the LLM sycophancy effect and elevates the average attack success rate (ASR) from 17.7% to 86.2% in a multi-turn setting. Our standardized setup further allows dissecting leakage of specific prompt contents such as task instructions and knowledge documents. We measure the mitigation effect of 7 black-box defense strategies, along with finetuning an open-source model to defend against leakage attempts. We present different combination of defenses against our threat model, including a cost analysis. Our study highlights key takeaways for building secure LLM applications and provides directions for research in multi-turn LLM interactions.},
}
@inproceedings{Han-pfolio-emnlp-24,
  author    = {Simeng Han and Aaron Yu and Rui Shen and Zhenting Qi and Martin Riddell and Wenfei Zhou and Yujie Qiao and Yilun Zhao and Semih Yavuz and Ye Liu and Shafiq Joty and Yingbo Zhou and Caiming Xiong and Rex Ying and Arman Cohan and Dragomir Radev},
  title     = {{P-FOLIO}: Evaluating and Improving Logical Reasoning with Abundant Human-Written Reasoning Chains},
  booktitle = {Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing},
  series    = {EMNLP'24 Findings},
  publisher = {ACL},
  address   = {Miami, USA},
  year      = {2024},
  url       = {https://arxiv.org/abs/2410.09207},
  abstract  = {Existing methods on understanding the capabilities of LLMs in logical reasoning rely on binary entailment classification or synthetically derived rationales, which are not sufficient for properly assessing model's capabilities. We present \textit{P-FOLIO}, a human-annotated dataset consisting of diverse and complex reasoning chains for a set of realistic logical reasoning stories also written by humans. P-FOLIO is collected with an annotation protocol that facilitates humans to annotate well-structured natural language proofs for first-order logic reasoning problems in a step-by-step manner. The number of reasoning steps in P-FOLIO span from 0 to 20. We further use P-FOLIO to evaluate and improve large-language-model (LLM) reasoning capabilities. We evaluate LLM reasoning capabilities at a fine granularity via single-step inference rule classification, with more diverse inference rules of more diverse and higher levels of complexities than previous works. Given that a single model-generated reasoning chain could take a completely different path than the human-annotated one, we sample multiple reasoning chains from a model and use pass@k metrics for evaluating the quality of model-generated reasoning chains. We show that human-written reasoning chains significantly boost the logical reasoning capabilities of LLMs via many-shot prompting and fine-tuning. Furthermore, fine-tuning Llama3-7B on P-FOLIO improves the model performance by 10% or more on three other out-of-domain logical reasoning datasets.},
}
@inproceedings{Artemis-et-al-eccv-24,
  author    = {Artemis Panagopoulou and Le Xue and Ning Yu and Junnan Li and Dongxu Li and Shafiq Joty and Ran Xu and Silvio Savarese and Caiming Xiong and Juan-Carlos Niebles},
  title     = {X-InstructBLIP: A Framework for aligning X-Modal instruction-aware representations to LLMs and Emergent Cross-modal Reasoning},
  booktitle = {2024 European Conference on Computer Vision},
  series    = {ECCV'24},
  address   = {Milan, Italy},
  year      = {2024},
  url       = {https://artemisp.github.io/X-InstructBLIP-page/},
  abstract  = {Vision-language pre-training and instruction tuning have demonstrated general-purpose capabilities in 2D visual reasoning tasks by aligning visual encoders with state-of-the-art large language models (LLMs). In this paper, we introduce a simple, yet effective, cross-modality framework built atop frozen LLMs that allows the integration of various modalities without extensive modality-specific customization. To facilitate instruction-modality fine-tuning, we collect high-quality instruction tuning data in an automatic and scalable manner, composed of 24K QA samples for audio and 250K QA samples for 3D. Leveraging instruction-aware representations, our model performs comparably with leading-edge counterparts without the need of extensive modality-specific pre-training or customization. Furthermore, our approach demonstrates cross-modal reasoning abilities across two or more input modalities, despite each modality projection being trained individually. To study the model's cross-modal abilities, we contribute a novel Discriminative Cross-modal Reasoning (DisCRn) evaluation task, comprising 9K audio-video QA samples and 28K image-3D QA samples that require the model to reason discriminatively across disparate input modalities.},
}
@inproceedings{Khan-et-al-acl-24,
  author    = {Mohammad Khan and M Bari and Do Long and Weishi Wang and Md Parvez and Shafiq Joty},
  title     = {{xCodeEval}: An Execution-based Large Scale Multilingual Multitask Benchmark for Code Understanding, Generation, Translation and Retrieval},
  booktitle = {Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics},
  series    = {ACL'24},
  publisher = {ACL},
  address   = {Bangkok, Thailand},
  year      = {2024},
  url       = {https://arxiv.org/abs/2303.03004},
  abstract  = {Recently, pre-trained large language models (LLMs) have shown impressive abilities in generating codes from natural language descriptions, repairing buggy codes, translating codes between languages, and retrieving relevant code segments. However, the evaluation of these models has often been performed in a scattered way on only one or two specific tasks, in a few languages, at a partial granularity (e.g., function) level, and in many cases without proper training data. Even more concerning is that in most cases the evaluation of generated codes has been done in terms of mere lexical overlap with a reference code rather than actual execution. We introduce *xCodeEval*, the largest executable multilingual multitask benchmark to date consisting of $25$M document-level coding examples ($16.5$B tokens) from about $7.5$K unique problems covering up to $11$ programming languages with execution-level parallelism. It features a total of $7$ tasks involving code understanding, generation, translation and retrieval. *xCodeEval* adopts an execution-based evaluation and offers a multilingual code execution engine, \texttt{ExecEval} that supports unit test based execution in all the $11$ languages. To address the challenge of balancing the distributions of text-code samples over multiple attributes in validation/test sets, we propose a novel data splitting and a data selection schema based on the geometric mean and graph-theoretic principle. Our experiments with OpenAI's LLMs (zero-shot) and open-LLMs (zero-shot and fine-tuned) on the tasks and languages demonstrate to be quite challenging as per the current advancements in language models.},
}
@inproceedings{Ravaut-acl-24,
  author    = {Mathieu Ravaut and Aixin Sun and Nancy Chen and Shafiq Joty},
  title     = {On Context Utilization in Summarization with Large Language Models},
  booktitle = {Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics},
  series    = {ACL'24},
  publisher = {ACL},
  address   = {Bangkok, Thailand},
  year      = {2024},
  url       = {https://arxiv.org/pdf/2310.10570v3},
  abstract  = {Large language models (LLMs) excel in abstractive summarization tasks, delivering fluent and pertinent summaries. Recent advancements have extended their capabilities to handle long-input contexts, exceeding 100k tokens. However, in question answering, language models exhibit uneven utilization of their input context. They tend to favor the initial and final segments, resulting in a U-shaped performance pattern concerning where the answer is located within the input. This bias raises concerns, particularly in summarization where crucial content may be dispersed throughout the source document(s). Besides, in summarization, mapping facts from the source to the summary is not trivial as salient content is usually re-phrased. In this paper, we conduct the first comprehensive study on context utilization and position bias in summarization. Our analysis encompasses 5 LLMs, 10 datasets, and 5 evaluation metrics. We introduce a new evaluation benchmark called MiddleSum on which we benchmark two alternative inference methods to alleviate position bias: hierarchical summarization and incremental summarization.},
}
@inproceedings{Phi-et-al-acl-24,
  author    = {Xuan-Phi Nguyen and Mahani Aljunied and Shafiq Joty and Lidong Bing},
  title     = {Democratizing LLMs for Low-Resource Languages by Leveraging their English Dominant Abilities with Linguistically-Diverse Prompts},
  booktitle = {Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics},
  series    = {ACL'24},
  publisher = {ACL},
  address   = {Bangkok, Thailand},
  year      = {2024},
  url       = {https://arxiv.org/abs/2306.11372},
  abstract  = {Large language models (LLMs) are known to effectively perform tasks by simply observing few exemplars. However, in low-resource languages, obtaining such hand-picked exemplars can still be challenging, where unsupervised techniques may be necessary. Moreover, competent generative capabilities of LLMs are observed only in high-resource languages, while their performances among under-represented languages fall behind due to pre-training data imbalance. To elicit LLMs' ability onto low-resource languages without any supervised data, we propose to assemble synthetic exemplars from a diverse set of high-resource languages to prompt the LLMs to translate from any language into English. These prompts are then used to create intra-lingual exemplars to perform tasks in the target languages. Our unsupervised prompting method performs on par with supervised few-shot learning in LLMs of different sizes for translations between English and 13 Indic and 21 African low-resource languages. We also show that fine-tuning a 7B model on data generated from our method helps it perform competitively with a 175B model. In non-English translation tasks, our method even outperforms supervised prompting by up to 3 chrF++ in many low-resource languages. When evaluated on zero-shot multilingual summarization, our method surpasses other English-pivoting baselines by up to 4 ROUGE-L and is also favored by GPT-4.},
}
@inproceedings{Masry-et-al-acl-24,
  author    = {Ahmed Masry and Mehrad Shahmohammadi and Md Parvez and Enamul Hoque and Shafiq Joty},
  title     = {ChartInstruct: Instruction Tuning for Chart Comprehension and Reasoning},
  booktitle = {Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics},
  series    = {ACL'24 Findings},
  publisher = {ACL},
  address   = {Bangkok, Thailand},
  year      = {2024},
  url       = {https://arxiv.org/pdf/2403.09028},
  abstract  = {Charts provide visual representations of data and are widely used for analyzing information, addressing queries, and conveying insights to others. Various chart-related downstream tasks have emerged recently, such as question-answering and summarization. A common strategy to solve these tasks is to fine-tune various models originally trained on vision tasks language. However, such task-specific models are not capable of solving a wide range of chart-related tasks, constraining their real-world applicability. To overcome these challenges, we introduce ChartInstruct: a novel chart-specific vision-language Instruction-following dataset comprising 191K instructions generated with 71K charts. We then present two distinct systems for instruction tuning on such datasets: (1) an end-to-end model that connects a vision encoder for chart understanding with a LLM; and (2) a pipeline model that employs a two-step approach to extract chart data tables and input them into the LLM. In experiments on four downstream tasks, we first show the effectiveness of our model--achieving a new set of state-of-the-art results. Further evaluation shows that our instruction-tuning approach supports a wide array of real-world chart comprehension and reasoning scenarios, thereby expanding the scope and applicability of our models to new kinds of tasks.},
}
@inproceedings{Ding-et-al-acl-24,
abstract = {In the rapidly evolving field of machine learning (ML), data augmentation (DA) has emerged as a pivotal technique for enhancing model performance by diversifying training examples without the need for additional data collection. This survey explores the transformative impact of Large Language Models (LLMs) on DA, particularly addressing the unique challenges and opportunities they present in the context of natural language processing (NLP) and beyond. We provide a comprehensive overview of methods leveraging LLMs for DA, including a novel exploration of learning paradigms where LLM-generated data is used for further training, thus enhancing model robustness and performance. Additionally, this paper delineates the primary challenges faced in this domain, ranging from controllable data augmentation to multi modal data augmentation. This survey highlights the paradigm shift introduced by LLMs in DA, aims to serve as a foundational guide for researchers and practitioners in this field.},
address = {Bangkok, Thailand},
author = {Bosheng Ding and Chengwei Qin and Ruochen Zhao and Tianze Luo and Xinze Li and Guizhen Chen and Wenhan Xia and Junjie Hu and Anh-Tuan Luu and Shafiq Joty},
booktitle = {Findings of the Association for Computational Linguistics: ACL 2024},
publisher = {ACL},
series = {ACL'24 Findings},
title = {Data Augmentation using {LLMs}: Methods, Learning Paradigms and Challenges},
url = {https://arxiv.org/abs/2403.02990},
year = {2024}
}
@inproceedings{Shohan-et-al-acl-24,
abstract = {Millions of news articles published online daily can overwhelm readers. Headlines and tags are essential for guiding readers to decide if the content is worth their time. While headline generation has been extensively studied, tag generation remains largely unexplored, yet it offers readers better access to topics of interest. The need for conciseness in capturing readers' attention necessitates improved content selection strategies for identifying salient and relevant segments within lengthy articles, thereby guiding language models effectively. To address this, we leverage auxiliary information such as images and captions embedded in the articles to retrieve sentences and utilize instruction tuning with variations to generate both headlines and tags for news articles in a multilingual context. To make use of the auxiliary information, we have compiled a dataset named XL-HeadTags, which includes 20 languages across 6 diverse language families. Through extensive evaluation, we demonstrate the effectiveness of our plug-and-play multimodal retrievers for both tasks. Additionally, we have developed a suite of tools for processing and evaluating multilingual texts, significantly contributing to the research community by enabling more accurate and efficient analysis across languages.},
address = {Bangkok, Thailand},
author = {Faisal Shohan and Mir Nayeem and Samsul Islam and Abu Akash and Shafiq Joty},
booktitle = {Findings of the Association for Computational Linguistics: ACL 2024},
publisher = {ACL},
series = {ACL'24 Findings},
title = {{XL-HeadTags}: Leveraging Multimodal Retrieval Augmentation for the Multilingual Generation of News Headlines and Tags},
year = {2024}
}
@inproceedings{Huang-et-al-NAACL-24,
abstract = {Previous research in multi-document news summarization has typically concentrated on collating information that all sources agree upon. However, to our knowledge, the summarization of diverse information dispersed across multiple articles about an event has not been previously investigated. The latter imposes a different set of challenges for a summarization model. In this paper, we propose a new task of summarizing diverse information encountered in multiple news articles encompassing the same event. To facilitate this task, we outlined a data collection schema for identifying diverse information and curated a dataset named DiverseSumm. The dataset includes 245 news stories, with each story comprising 10 news articles and paired with a human-validated reference. Moreover, we conducted a comprehensive analysis to pinpoint the position and verbosity biases when utilizing Large Language Model (LLM)-based metrics for evaluating the coverage and faithfulness of the summaries, as well as their correlation with human assessments. We applied our findings to study how LLMs summarize multiple news articles by analyzing which type of diverse information LLMs are capable of identifying. Our analyses suggest that despite the extraordinary capabilities of LLMs in single-document summarization, the proposed task remains a complex challenge for them mainly due to their limited coverage, with GPT-4 only able to cover less than 40% of the diverse information on average.},
address = {Mexico City, Mexico},
author = {Kung-Hsiang Huang and Philippe Laban and Alexander Fabbri and Prafulla Kumar Choubey and Shafiq Joty and Caiming Xiong and Chien-Sheng Wu},
booktitle = {2024 Annual Conference of the North American Chapter of the Association for Computational Linguistics},
series = {NAACL-24},
title = {Embrace Divergence for Richer Insights: A Multi-document Summarization Benchmark and a Case Study on Summarizing Diverse Information from News Articles},
url = {https://arxiv.org/abs/2309.09369},
year = {2024}
}
@inproceedings{Fangkai-et-al-NAACL-24,
abstract = {Traditional attempts to enhance the logical reasoning abilities of language models often rely on supervised fine-tuning, limiting their generalization to new tasks or domains. Large Language Models (LLMs), with their capacity to condense vast knowledge, can effectively tackle many tasks. Yet, our experiments reveal a gap in their performance on logical reasoning benchmarks when compared to state-of-the-art fine-tuning based models. To bridge this gap, we present LogicLLM, a first-of-its-kind, fully self-supervised framework for integrating logical reasoning capabilities into LLMs, and activating them via in-context learning. We apply this to two LLM series, FLAN-T5 and LLaMA, with parameter sizes from 3 billion to 33 billion. LogicLLM demonstrates its effectiveness through successful improvements on two logical reasoning benchmarks (ReClor and LogiQA-v2). Additionally, LogicLLM based on FLAN-T5-11B attains comparable results to ChatGPT, and evaluations with LLaMA-based models on three language understanding benchmarks (RACE, MMLU and Big-Bench-Hard) confirm that the improvements come without compromising the model's general language understanding capabilities.},
address = {Mexico City, Mexico},
author = {Fangkai Jiao and Zhiyang Teng and Bosheng Ding and Zhengyuan Liu and Nancy Chen and Shafiq Joty},
booktitle = {2024 Annual Conference of the North American Chapter of the Association for Computational Linguistics},
series = {NAACL-24},
title = {Exploring Self-supervised Logic-enhanced Training for Large Language Models},
url = {https://arxiv.org/abs/2305.13718},
year = {2024}
}
@inproceedings{Chengwei-et-al-NAACL-24,
abstract = {To mitigate forgetting, existing lifelong event detection methods typically maintain a memory module and replay the stored memory data during the learning of a new task. However, the simple combination of memory data and new-task samples can still result in substantial forgetting of previously acquired knowledge, which may occur due to the potential overlap between the feature distribution of new data and the previously learned embedding space. Moreover, the model suffers from overfitting on the few memory samples rather than effectively remembering learned patterns. To address the challenges of forgetting and overfitting, we propose a novel method based on embedding space separation and compaction. Our method alleviates forgetting of previously learned tasks by forcing the feature distribution of new data away from the previous embedding space. It also mitigates overfitting by a memory calibration mechanism that encourages memory data to be close to its prototype to enhance intra-class compactness. In addition, the learnable parameters of the new task are initialized by drawing upon acquired knowledge from the previously learned task to facilitate forward knowledge transfer. With extensive experiments, we demonstrate that our method can significantly outperform previous state-of-the-art approaches.},
address = {Mexico City, Mexico},
author = {Chengwei Qin and Ruirui Chen and Ruochen Zhao and Wenhan Xia and Shafiq Joty},
booktitle = {2024 Annual Conference of the North American Chapter of the Association for Computational Linguistics},
series = {NAACL-24},
title = {Lifelong Event Detection with Embedding Space Separation and Compaction},
url = {https://openreview.net/forum?id=QL69qAZgTnx},
year = {2024}
}
@inproceedings{Yixin-et-al-NAACL-24,
abstract = {While large language models (LLMs) can already achieve strong performance on standard generic summarization benchmarks, their performance on more complex summarization task settings is less studied. Therefore, we benchmark LLMs on instruction controllable text summarization, where the model input consists of both a source article and a natural language requirement for desired summary characteristics. To this end, we curate an evaluation-only dataset for this task setting and conduct human evaluation on 5 LLM-based summarization systems. We then benchmark LLM-based automatic evaluation for this task with 4 different evaluation protocols and 11 LLMs, resulting in 40 evaluation methods in total. Our study reveals that instruction controllable text summarization remains a challenging task for LLMs, since (1) all LLMs evaluated still make factual and other types of errors in their summaries; (2) no LLM-based evaluation methods can achieve a strong alignment with human annotators when judging the quality of candidate summaries; (3) different LLMs show large performance gaps in summary generation and evaluation capabilities. We make our collected benchmark InstruSum publicly available to facilitate future research in this direction.},
address = {Mexico City, Mexico},
author = {Yixin Liu and Alexander Fabbri and Jiawen Chen and Yilun Zhao and Simeng Han and Shafiq Joty and Pengfei Liu and Dragomir Radev and Chien-Sheng Wu and Arman Cohan},
booktitle = {Findings of the Association for Computational Linguistics: NAACL 2024},
series = {NAACL-24 Findings},
title = {Benchmarking Generation and Evaluation Capabilities of Large Language Models for Instruction Controllable Summarization},
url = {https://arxiv.org/pdf/2311.09184.pdf},
year = {2024}
}
% NOTE(review): the citation key says EACL, but address/booktitle/series all describe
% NAACL-24 Findings. The key is left unchanged so existing citations keep resolving.
@inproceedings{Sawhney-et-al-EACL-24,
abstract = {The Euclidean space is the familiar space for training neural models and performing arithmetic operations. However, many data types inherently possess complex geometries, and model training methods involve operating over their latent representations, which cannot be effectively captured in the Euclidean space. The hyperbolic space provides a more generalized representative geometry to model the hierarchical complexities of the tree-like structure of natural language. We propose AdaPT, a set of guidelines for initialization, parametrization, and training of neural networks, which adapts to the dataset and can be used with different manifolds. AdaPT can be generalized over any existing neural network training methodology and leads to more stable training without a substantial increase in training time. We apply AdaPT guidelines over two state-of-the-art deep learning approaches and empirically demonstrate its effectiveness through experiments on three tasks over 12 languages across speech and text. Through extensive qualitative analysis, we put forward the applicability of AdaPT as a set of guidelines optimally utilizing the manifold geometry, which can be extended to various downstream tasks across languages and modalities.},
address = {Mexico City, Mexico},
author = {Ramit Sawhney and Megh Thakkar and Vishwa Shah and Shrey Pandit and Shafiq Joty},
booktitle = {Findings of the Association for Computational Linguistics: NAACL 2024},
series = {NAACL-24 Findings},
title = {{AdaPT}: A Set of Guidelines for Hyperbolic Multimodal Multilingual {NLP}},
url = {https://openreview.net/forum?id=SO_l8Jsa9kc},
year = {2024}
}
@inproceedings{Wenting-et-al-NAACL-24,
abstract = {Large Language Models (LLMs) have exhibited impressive generation capabilities, but they suffer from hallucinations when solely relying on their internal knowledge, especially when answering questions that require less commonly known information. Retrieval-augmented LLMs have emerged as a potential solution to ground LLMs in external knowledge. Nonetheless, recent approaches have primarily emphasized retrieval from unstructured text corpora, owing to its seamless integration into prompts. When using structured data such as knowledge graphs, most methods simplify it into natural text, neglecting the underlying structures. Moreover, a significant gap in the current landscape is the absence of a realistic benchmark for evaluating the effectiveness of grounding LLMs on heterogeneous knowledge sources (e.g., knowledge base and text). To fill this gap, we have curated a comprehensive dataset that poses two unique challenges: (1) Two-hop multi-source questions that require retrieving information from both open-domain structured and unstructured knowledge sources; retrieving information from structured knowledge sources is a critical component in correctly answering the questions. (2) Generation of symbolic queries (e.g., SPARQL for Wikidata) is a key requirement, which adds another layer of challenge. Our dataset is created using a combination of automatic generation through predefined reasoning chains and human annotation. We also introduce a novel approach that leverages multiple retrieval tools, including text passage retrieval and symbolic language-assisted retrieval. Our model outperforms previous approaches by a significant margin, demonstrating its effectiveness in addressing the above-mentioned reasoning challenges.},
address = {Mexico City, Mexico},
author = {Wenting Zhao and Ye Liu and Tong Niu and Yao Wan and Philip Yu and Shafiq Joty and Yingbo Zhou and Semih Yavuz},
booktitle = {Findings of the Association for Computational Linguistics: NAACL 2024},
series = {NAACL-24 Findings},
title = {{DIVKNOWQA}: Assessing the Reasoning Ability of {LLMs} via Open-Domain Question Answering over Knowledge Base and Text},
url = {https://arxiv.org/abs/2310.20170},
year = {2024}
}
@inproceedings{Bram-et-al-CVPR-24,
abstract = {Large language models (LLMs) are fine-tuned using human comparison data with Reinforcement Learning from Human Feedback (RLHF) methods to make them better aligned with users' preferences. In contrast to LLMs, human preference learning has not been widely explored in text-to-image diffusion models; the best existing approach is to fine-tune a pretrained model using carefully curated high quality images and captions to improve visual appeal and text alignment. We propose Diffusion-DPO, a method to align diffusion models to human preferences by directly optimizing on human comparison data. Diffusion-DPO is adapted from the recently developed Direct Preference Optimization (DPO), a simpler alternative to RLHF which directly optimizes a policy that best satisfies human preferences under a classification objective. We re-formulate DPO to account for a diffusion model notion of likelihood, utilizing the evidence lower bound to derive a differentiable objective. Using the Pick-a-Pic dataset of 851K crowdsourced pairwise preferences, we fine-tune the base model of the state-of-the-art Stable Diffusion XL (SDXL)-1.0 model with Diffusion-DPO. Our fine-tuned base model significantly outperforms both base SDXL-1.0 and the larger SDXL-1.0 model consisting of an additional refinement model in human evaluation, improving visual appeal and prompt alignment. We also develop a variant that uses AI feedback and has comparable performance to training on human preferences, opening the door for scaling of diffusion model alignment methods.},
address = {Seattle, USA},
author = {Bram Wallace and Meihua Dang and Rafael Rafailov and Linqi Zhou and Aaron Lou and Senthil Purushwalkam and Stefano Ermon and Caiming Xiong and Shafiq Joty and Nikhil Naik},
booktitle = {IEEE/CVF Conference on Computer Vision and Pattern Recognition},
series = {CVPR-24},
title = {Diffusion Model Alignment Using Direct Preference Optimization},
url = {https://arxiv.org/abs/2311.12908},
year = {2024}
}
@inproceedings{Le-et-al-ICLR-24,
abstract = {Large Language Models (LLMs) have already become quite proficient at solving simpler programming tasks like those in HumanEval or MBPP benchmarks. However, solving more complex and competitive programming tasks is still quite challenging for these models - possibly due to their tendency to generate solutions as monolithic code blocks instead of decomposing them into logical sub-tasks and sub-modules. On the other hand, experienced programmers instinctively write modularized code with abstraction for solving complex tasks, often reusing previously developed modules. To address this gap, we propose CodeChain, a novel framework for inference that elicits modularized code generation through a chain of self-revisions, each being guided by some representative sub-modules generated in previous iterations. Concretely, CodeChain first instructs the LLM to generate modularized codes through chain-of-thought prompting. Then it applies a chain of self-revisions by iterating the two steps: 1) extracting and clustering the generated sub-modules and selecting the cluster representatives as the more generic and re-usable implementations, and 2) augmenting the original chain-of-thought prompt with these selected module-implementations and instructing the LLM to re-generate new modularized solutions. We find that by naturally encouraging the LLM to reuse the previously developed and verified sub-modules, CodeChain can significantly boost both modularity as well as correctness of the generated solutions, achieving relative pass@1 improvements of 35% on APPS and 76% on CodeContests. It is shown to be effective on both OpenAI LLMs as well as opensourced LLMs like WizardCoder. We also conduct comprehensive ablation studies with different methods of prompting, number of clusters, model sizes, program qualities, etc., to provide useful insights that underpin CodeChain's success.},
address = {Vienna, Austria},
author = {Hung Le and Hailin Chen and Amrita Saha and Akash Gokul and Doyen Sahoo and Shafiq Joty},
booktitle = {International Conference on Learning Representations},
series = {ICLR-24},
title = {{CodeChain}: Towards Modular Code Generation Through Chain of Self-revisions with Representative Sub-modules},
url = {https://openreview.net/pdf?id=vYhglxSj8j},
year = {2024}
}
@inproceedings{Li-et-al-ICLR-24,
abstract = {We present chain-of-knowledge (CoK), a novel framework that augments large language models (LLMs) by dynamically incorporating grounding information from heterogeneous sources. It results in more factual rationales and reduced hallucination in generation. Specifically, CoK consists of three stages: reasoning preparation, dynamic knowledge adapting, and answer consolidation. Given a knowledge-intensive question, CoK first prepares several preliminary rationales and answers while identifying the relevant knowledge domains. If there is no majority consensus among the answers from samples, CoK corrects the rationales step by step by adapting knowledge from the identified domains. These corrected rationales can plausibly serve as a better foundation for the final answer consolidation. Unlike prior studies that primarily use unstructured data, CoK also leverages structured knowledge sources such as Wikidata and tables that provide more reliable factual information. To access both unstructured and structured knowledge sources in the dynamic knowledge adapting stage, we propose an adaptive query generator that allows the generation of queries for various types of query languages, including SPARQL, SQL, and natural sentences. Moreover, to minimize error propagation between rationales, CoK corrects the rationales progressively using preceding corrected rationales to generate and correct subsequent rationales. Extensive experiments show that CoK consistently improves the performance of LLMs on knowledge-intensive tasks across different domains.},
address = {Vienna, Austria},
author = {Xingxuan Li and Ruochen Zhao and Yew Ken Chia and Bosheng Ding and Shafiq Joty and Soujanya Poria and Lidong Bing},
booktitle = {International Conference on Learning Representations},
series = {ICLR-24},
title = {Chain of Knowledge: A Framework for Grounding Large Language Models with Structured Knowledge Bases},
url = {https://openreview.net/pdf?id=cPgh4gWZlz},
year = {2024}
}
@article{Ni-et-al-TACL,
abstract = {Recently, large language models (LLMs), especially those that are pretrained on code, have demonstrated strong capabilities in generating programs from natural language inputs in a few-shot or even zero-shot manner. Despite promising results, there is a notable lack of a comprehensive evaluation of these models' language-to-code generation capabilities. Existing studies often focus on specific tasks, model architectures, or learning paradigms, leading to a fragmented understanding of the overall landscape. In this work, we present L2CEval, a systematic evaluation of the language-to-code generation capabilities of LLMs on 7 tasks across the domain spectrum of semantic parsing, math reasoning and Python programming, analyzing the factors that potentially affect their performance, such as model size, pretraining data, instruction tuning, and different prompting methods. In addition to assessing model performance, we measure confidence calibration for the models and conduct human evaluations of the output programs. This enables us to identify and analyze the typical failure modes across various tasks and models. L2CEval offers a comprehensive understanding of the capabilities and limitations of LLMs in language-to-code generation. We also release the evaluation framework and all model outputs, hoping to lay the groundwork for further future research in this domain.},
author = {Ansong Ni and Pengcheng Yin and Yilun Zhao and Martin Riddell and Troy Feng and Rui Shen and Stephen Yin and Ye Liu and Semih Yavuz and Caiming Xiong and Shafiq Joty and Yingbo Zhou and Dragomir Radev and Arman Cohan},
journal = {Transactions of the Association for Computational Linguistics},
series = {TACL},
title = {{L2CEval}: Evaluating Language-to-Code Generation Capabilities of Large Language Models},
url = {https://arxiv.org/pdf/2309.17446v2.pdf},
year = {2024}
}
@article{Wang-et-al-tkde,
  author   = {Lingzhi Wang and Shafiq Joty and Wei Gao and Xingshan Zeng and Kam-Fai Wong},
  title    = {{Improving conversational recommender system via contextual and time-aware modeling with less domain-specific knowledge}},
  journal  = {IEEE Transactions on Knowledge and Data Engineering},
  series   = {IEEE},
  year     = {2024},
  url      = {https://arxiv.org/pdf/2209.11386},
  abstract = {Conversational Recommender Systems (CRS) has become an emerging research topic seeking to perform recommendations through interactive conversations, which generally consist of generation and recommendation modules. Prior work on CRS tends to incorporate more external and domain-specific knowledge like item reviews to enhance performance. Despite the fact that the collection and annotation of the external domain-specific information needs much human effort and degenerates the generalizability, too much extra knowledge introduces more difficulty to balance among them. Therefore, we propose to fully discover and extract the internal knowledge from the context. We capture both entity-level and contextual-level representations to jointly model user preferences for the recommendation, where a time-aware attention is designed to emphasize the recently appeared items in entity-level representations. We further use the pre-trained BART to initialize the generation module to alleviate the data scarcity and enhance the context modeling. In addition to conducting experiments on a popular dataset (ReDial), we also include a multi-domain dataset (OpenDialKG) to show the effectiveness of our model. Experiments on both datasets show that our model achieves better performance on most evaluation metrics with less external knowledge and generalizes well to other domains. Additional analyses on the recommendation and generation tasks demonstrate the effectiveness of our model in different scenarios.}
}
@inproceedings{Zhao-et-al-EACL-24,
abstract = {To encourage fairness and transparency, there exists an urgent demand for deriving reliable explanations for large language models (LLMs). One promising solution is concept-based explanations, i.e., human-understandable concepts from internal representations. However, due to the compositional nature of languages, current methods mostly discover correlational explanations instead of causal features. Therefore, we propose a novel framework to provide impact-aware explanations for users to understand the LLM's behavior, which are robust to feature changes and influential to the model's predictions. Specifically, we extract predictive high-level features (concepts) from the model's hidden layer activations. Then, we innovatively optimize for features whose existence causes the output predictions to change substantially. Extensive experiments on real and synthetic tasks demonstrate that our method achieves superior results on predictive impact, explainability, and faithfulness compared to the baselines, especially for LLMs.},
address = {Malta},
author = {Ruochen Zhao and Shafiq Joty and Yongjie Wang and Tan Wang},
booktitle = {Findings of the Association for Computational Linguistics: EACL 2024},
series = {EACL-24},
title = {Explaining Language Model Predictions with High-Impact Concepts},
url = {https://arxiv.org/abs/2305.02160},
year = {2024}
}
@inproceedings{Tu-et-al-EACL-24,
abstract = {Cross-lingual transfer of language models trained on high-resource languages like English has been widely studied for many NLP tasks, but focus on conversational tasks has been rather limited. This is partly due to the high cost of obtaining non-English conversational data, which results in limited coverage. In this work, we introduce XSGD for cross-lingual alignment pretraining, a parallel and large-scale multilingual conversation dataset that we created by translating the English-only Schema-Guided Dialogue (SGD) dataset (Rastogi et al., 2020) into 105 other languages. XSGD contains about 330k utterances per language. To facilitate aligned cross-lingual representations, we develop an efficient prompt-tuning-based method for learning alignment prompts. We also investigate two different classifiers: NLI-based and vanilla classifiers, and test cross-lingual capability enabled by the aligned prompts. We evaluate our model's cross-lingual generalization capabilities on two conversation tasks: slot-filling and intent classification. Our results demonstrate strong and efficient modeling ability of NLI-based classifiers and the large cross-lingual transfer improvements achieved by our aligned prompts, particularly in few-shot settings. We also conduct studies on large language models (LLMs) such as text-davinci-003 and ChatGPT in both zero- and few-shot settings. While LLMs exhibit impressive performance in English, their cross-lingual capabilities in other languages, particularly low-resource ones, are limited.},
address = {Malta},
author = {Lifu Tu and Jin Qu and Semih Yavuz and Shafiq Joty and Wenhao Liu and Caiming Xiong and Yingbo Zhou},
booktitle = {Findings of the Association for Computational Linguistics: EACL 2024},
series = {EACL-24},
title = {Efficiently Aligned Cross-Lingual Transfer Learning for Conversational Tasks using Prompt-Tuning},
url = {https://arxiv.org/abs/2304.01295},
year = {2024}
}
@inproceedings{Chen-emnlp-23,
abstract = {With the rise of powerful closed-sourced LLMs (ChatGPT, GPT-4), there are increasing interests in distilling the capabilities of close-sourced LLMs to smaller open-sourced LLMs. Previous distillation methods usually prompt ChatGPT to generate a set of instructions and answers, for the student model to learn. However, such standard distillation approach neglects the merits and conditions of the student model. Inspired by modern teaching principles, we design a personalised distillation process, in which the student attempts to solve a task first, then the teacher provides an adaptive refinement for the student to improve. Instead of feeding the student with teacher's prior, personalised distillation enables personalised learning for the student model, as it only learns on examples it makes mistakes upon and learns to improve its own solution. On code generation, personalised distillation consistently outperforms standard distillation with only one third of the data. With only 2.5-3K personalised examples that incur a data-collection cost of 4-6$, we boost CodeGen-mono-16B by 7% to achieve 36.4% pass@1 and StarCoder by 12.2% to achieve 45.8% pass@1 on HumanEval.},
address = {Singapore},
author = {Hailin Chen and Amrita Saha and Shafiq Joty and Steven Hoi},
booktitle = {Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing},
publisher = {ACL},
series = {EMNLP'23},
title = {Personalized Distillation: Empowering Open-Sourced {LLMs} with Adaptive Learning for Code Generation},
year = {2023}
}
@inproceedings{Qin-emnlp-23,
abstract = {Lifelong sequence generation (LSG), a problem in continual learning, aims to continually train a model on a sequence of generation tasks to learn constantly emerging new generation patterns while avoiding the forgetting of previous knowledge. Existing LSG methods mainly focus on maintaining old knowledge while paying little attention to knowledge transfer across tasks. In contrast, humans can better learn new tasks by leveraging previously acquired knowledge from similar tasks. Inspired by the learning paradigm of humans, we propose Dynamic Module Expansion and Adaptation (DMEA), which enables the model to dynamically determine the architecture for acquiring new knowledge based on task correlation and select the most similar previous tasks to facilitate adaptation to new tasks. In addition, as the learning process can easily be biased towards the current task which might cause more severe forgetting of previously learned knowledge, we propose dynamic gradient scaling to balance the learning of the current task and replayed tasks. With extensive experiments, we demonstrate that DMEA can consistently outperform existing methods in different LSG settings.},
address = {Singapore},
author = {Chengwei Qin and Shafiq Joty and Chen Chen},
booktitle = {Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing},
publisher = {ACL},
series = {EMNLP'23},
title = {Lifelong Sequence Generation with Dynamic Module Expansion and Adaptation},
year = {2023}
}
@inproceedings{Masry-emnlp-23,
  author    = {Ahmed Masry and Parsa Kavehzadeh and Xuan Long Do and Enamul Hoque and Shafiq Joty},
  title     = {UniChart: A Universal Vision-language Pretrained Model for Chart Comprehension and Reasoning},
  booktitle = {Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing},
  series    = {EMNLP'23},
  publisher = {ACL},
  address   = {Singapore},
  year      = {2023},
  url       = {https://arxiv.org/abs/2305.14761},
  abstract  = {Charts are widely used for data analysis, providing visual representations and insights into complex data. To facilitate chart-based data analysis using natural language, several downstream tasks have been introduced recently such as chart question answering and chart summarization. However, existing methods for these tasks often rely on pretraining on language or vision-language tasks, neglecting the explicit modeling of chart structures (e.g., how chart elements are related to each other). To address this, we first build a large corpus of charts covering diverse topics and visual styles. We then present UniChart, a pretrained model for chart comprehension and reasoning. UniChart encodes the relevant text, data, and visual elements of charts and then uses a chart-grounded text decoder for text generation. We propose several chart-specific pretraining tasks that include: (i) low-level tasks to extract the visual elements (e.g., bars, lines) and data from charts, and (ii) high-level tasks to acquire chart understanding and reasoning skills. Our experiments demonstrate that pretraining UniChart on a large corpus with chart-specific objectives, followed by fine-tuning, yields state-of-the-art performance on four downstream tasks. Moreover, our model exhibits superior generalizability to unseen chart corpus, surpassing previous approaches that lack chart-specific objectives and utilize limited chart resources.}
}
@inproceedings{Weishi-emnlp-23,
  author    = {Weishi Wang and Yue Wang and Shafiq Joty and Steven Hoi},
  title     = {Towards Low-Resource Automatic Program Repair with Meta-Learning and Pretrained Language Models},
  booktitle = {Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing},
  series    = {EMNLP'23},
  publisher = {ACL},
  address   = {Singapore},
  year      = {2023},
  url       = {},
  abstract  = {Automatic program repair (APR) has gained increasing attention as an essential technique in software development to reduce manual debugging efforts and boost developers' productivity. Recent advances in deep learning (DL) based models have demonstrated promising results by learning from large-scale bug-fix examples in a data-driven manner. However, in practical scenarios, software bugs have an imbalanced distribution, and the fixing knowledge learned by APR models often only capture the patterns of frequent error types, making it inapplicable to handle the rare error types. To address this limitation, we investigate a novel task of low-resource APR, and propose Meta-APR, a new meta-learning framework integrated with code pretrained language models to generate fixes for low-resource bugs with limited training samples. Our Meta-APR learns better error-specific knowledge from high-resource bugs through efficient first-order meta-learning optimization, which allows for a faster adaptation to the target low-resource bugs. Besides, while we adopt CodeT5, a pretrained code-aware encoder-decoder Transformer, as the backbone model for Meta-APR, it is a model-agnostic framework that can be integrated with any neural models. Extensive experimental results on three benchmarks in various programming languages verify the superiority of our method over existing DL-based APR approaches.}
}
@inproceedings{Laban-emnlp-23,
  author    = {Philippe Laban and Wojciech Kryscinski and Divyansh Agarwal and Alexander Fabbri and Caiming Xiong and Shafiq Joty and Chien-Sheng Wu},
  title     = {SummEdits: Measuring LLM Ability at Factual Reasoning Through The Lens of Summarization},
  booktitle = {Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing},
  series    = {EMNLP'23},
  publisher = {ACL},
  address   = {Singapore},
  year      = {2023},
  url       = {https://arxiv.org/abs/2305.14540},
  abstract  = {With the recent appearance of LLMs in practical settings, having methods that can effectively detect factual inconsistencies is crucial to reduce the propagation of misinformation and improve trust in model outputs. When testing on existing factual consistency benchmarks, we find that a few large language models (LLMs) perform competitively on classification benchmarks for factual inconsistency detection compared to traditional non-LLM methods. However, a closer analysis reveals issues with existing evaluation benchmarks, affecting evaluation precision. To address this, we propose a new protocol for inconsistency detection benchmark creation and implement it in a 10-domain benchmark called SummEdits. This new benchmark is 20 times more cost-effective per sample than previous benchmarks and highly reproducible, as we estimate inter-annotator agreement at about 0.9. Most LLMs struggle on SummEdits, with performance close to random chance. The best-performing model, GPT-4, is still 8% below estimated human performance, highlighting the gaps in LLMs' ability to reason about facts and detect inconsistencies when they occur.}
}
@inproceedings{Yixin-emnlp-23,
  author    = {Yixin Liu and Alex Fabbri and Pengfei Liu and Shafiq Joty and Chien-Sheng Wu and Caiming Xiong and Dragomir Radev},
  title     = {Towards Interpretable and Efficient Automatic Reference-Based Summarization Evaluation},
  booktitle = {Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing},
  series    = {EMNLP'23},
  publisher = {ACL},
  address   = {Singapore},
  year      = {2023},
  url       = {https://arxiv.org/abs/2303.03608},
  abstract  = {Interpretability and efficiency are two important considerations for the adoption of neural automatic metrics. In this work, we develop strong-performing automatic metrics for reference-based summarization evaluation, based on a two-stage evaluation pipeline that first extracts basic information units from one text sequence and then checks the extracted units in another sequence. The metrics we developed include two-stage metrics that can provide high interpretability at both the fine-grained unit level and summary level, and one-stage metrics that achieve a balance between efficiency and interpretability. We make the developed tools publicly available through a Python package and GitHub.}
}
@inproceedings{Zhao-emnlp-23,
  author    = {Ruochen Zhao and Hailin Chen and Weishi Wang and Fangkai Jiao and Xuan Long Do and Chengwei Qin and Bosheng Ding and Xiaobao Guo and Minzhi Li and Xingxuan Li and Shafiq Joty},
  title     = {Retrieving Multimodal Information for Augmented Generation: A Survey},
  booktitle = {Findings of the 2023 Conference on Empirical Methods in Natural Language Processing},
  series    = {EMNLP'23 Findings},
  publisher = {ACL},
  address   = {Singapore},
  year      = {2023},
  url       = {https://arxiv.org/abs/2303.10868},
  abstract  = {As Large Language Models (LLMs) become popular, there emerged an important trend of using multimodality to augment the LLMs' generation ability, which enables LLMs to better interact with the world. However, there lacks a unified perception of at which stage and how to incorporate different modalities. In this survey, we review methods that assist and augment generative models by retrieving multimodal knowledge, whose formats range from images, codes, tables, graphs, to audio. Such methods offer a promising solution to important concerns such as factuality, reasoning, interpretability, and robustness. By providing an in-depth review, this survey is expected to provide scholars with a deeper understanding of the methods' applications and encourage them to adapt existing techniques to the fast-growing field of LLMs.}
}
@inproceedings{Liu-emnlp-23,
  author    = {Ye Liu and Semih Yavuz and Rui Meng and Dragomir Radev and Caiming Xiong and Shafiq Joty and Yingbo Zhou},
  title     = {HPE: Answering Complex Questions over Text by Hybrid Question Parsing and Execution},
  booktitle = {Findings of the 2023 Conference on Empirical Methods in Natural Language Processing},
  series    = {EMNLP'23 Findings},
  publisher = {ACL},
  address   = {Singapore},
  year      = {2023},
  url       = {},
  abstract  = {The dominant paradigm of textual question answering systems is based on end-to-end neural networks, which excels at answering natural language questions but falls short on complex ones. This stands in contrast to the broad adaptation of semantic parsing approaches over structured data sources (e.g., relational database, knowledge graphs), that convert natural language questions to logical forms and execute them with query engines. Towards combining the strengths of neural and symbolic methods, we propose a framework of question parsing and execution on textual QA. It comprises two central pillars: (1) We parse the question of varying complexity into an intermediate representation, named H-expression, which is composed of simple questions as the primitives and symbolic operations representing the relationships among them; (2) To execute the resulting H-expressions, we design a hybrid executor, which integrates the deterministic rules to translate the symbolic operations with a drop-in neural reader network to answer each decomposed simple question. Hence, the proposed framework can be viewed as a top-down question parsing followed by a bottom-up answer backtracking. The resulting H-expressions closely guide the execution process, offering higher precision besides better interpretability while still preserving the advantages of the neural readers for resolving its primitive elements. Our extensive experiments on MuSiQue, 2WikiQA, HotpotQA, and NQ show that the proposed parsing and hybrid execution framework outperforms existing approaches in supervised, few-shot, and zero-shot settings, while also effectively exposing its underlying reasoning process.}
}
@inproceedings{Zhao-acl-23,
  author    = {Ruochen Zhao and Xingxuan Li and Shafiq Joty and Chengwei Qin and Lidong Bing},
  title     = {Verify-and-Edit: A Knowledge-Enhanced Chain-of-Thought Framework},
  booktitle = {Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics},
  series    = {ACL'23},
  publisher = {ACL},
  address   = {Toronto, Canada},
  year      = {2023},
  url       = {https://arxiv.org/abs/2305.03268},
  abstract  = {As large language models (LLMs) have become the norm in NLP, demonstrating good performance in generation and reasoning tasks, one of its most fatal disadvantages is the lack of factual correctness. Generating unfactual texts not only leads to lower performances but also degrades the trust and validity of their applications. Chain-of-Thought (CoT) prompting improves trust and model performance on complex reasoning tasks by generating interpretable reasoning chains, but still suffers from factuality concerns in knowledge-intensive tasks. In this paper, we propose the Verify-and-Edit framework for CoT prompting, which seeks to increase prediction factuality by post-editing reasoning chains according to external knowledge. Building on top of GPT-3, our framework lead to accuracy improvements in multiple open-domain question-answering tasks.}
}
@inproceedings{Yin-acl-23,
  author    = {Fan Yin and Jesse Vig and Philippe Laban and Shafiq Joty and Caiming Xiong and Chien-Sheng Jason Wu},
  title     = {Did You Read the Instructions? Rethinking the Effectiveness of Task Definitions in Instruction Learning},
  booktitle = {Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics},
  series    = {ACL'23},
  publisher = {ACL},
  address   = {Toronto, Canada},
  year      = {2023},
  url       = {},
  abstract  = {Large language models (LLMs) have shown impressive performance in following natural language instructions to solve unseen tasks. However, it remains unclear whether models truly understand task definitions and whether the human-written definitions are optimal. In this paper, we systematically study the role of task definitions in instruction learning. We first conduct an ablation analysis informed by human annotations to understand which parts of a task definition are most important, and find that model performance only drops substantially when removing contents describing the task output, in particular label information. Next, we propose an automatic algorithm to compress task definitions to a minimal supporting set of tokens, and find that 60\% of tokens can be removed while maintaining or even improving model performance. Based on these results, we propose two strategies to help models better leverage task instructions: (1) providing only key information for tasks in a common structured format, and (2) adding a meta-tuning stage to help the model better understand the definitions. With these two strategies, we achieve a 4.2 Rouge-L improvement over 119 unseen test tasks.}
}
@inproceedings{yixin-acl23,
  author    = {Yixin Liu and Alex Fabbri and Pengfei Liu and Yilun Zhao and Linyong Nan and Ruilin Han and Simeng Han and Shafiq Joty and Chien-Sheng Jason Wu and Caiming Xiong and Dragomir Radev},
  title     = {Revisiting the Gold Standard: Grounding Summarization Evaluation with Robust Human Evaluation},
  booktitle = {Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics},
  series    = {ACL'23},
  publisher = {ACL},
  address   = {Toronto, Canada},
  year      = {2023},
  url       = {https://arxiv.org/abs/2212.07981},
  abstract  = {Human evaluation is the foundation upon which the evaluation of both summarization systems and automatic metrics rests. However, existing human evaluation studies for summarization either exhibit a low inter-annotator agreement or have insufficient scale, and an in-depth analysis of human evaluation is lacking. Therefore, we address the shortcomings of existing summarization evaluation along the following axes: (1) We propose a modified summarization salience protocol, Atomic Content Units (ACUs), which is based on fine-grained semantic units and allows for a high inter-annotator agreement. (2) We curate the Robust Summarization Evaluation (RoSE) benchmark, a large human evaluation dataset consisting of 22,000 summary-level annotations over 28 top-performing systems on three datasets. (3) We conduct a comparative study of four human evaluation protocols, underscoring potential confounding factors in evaluation setups. (4) We evaluate 50 automatic metrics and their variants using the collected human annotations across evaluation protocols and demonstrate how our benchmark leads to more statistically stable and significant results. Furthermore, our findings have important implications for evaluating large language models (LLMs), as we show that LLMs adjusted by human feedback (e.g., GPT-3.5) may overfit unconstrained human evaluation, which is affected by the annotators' prior, input-agnostic preferences, calling for more robust, targeted evaluation methods.}
}
@inproceedings{Laban-acl-23,
  author    = {Philippe Laban and Jesse Vig and Wojciech Kryscinski and Shafiq Joty and Caiming Xiong and Chien-Sheng Jason Wu},
  title     = {SWiPE: A Dataset for Document-Level Simplification of Wikipedia Pages},
  booktitle = {Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics},
  series    = {ACL'23},
  publisher = {ACL},
  address   = {Toronto, Canada},
  year      = {2023},
  url       = {https://arxiv.org/abs/2305.19204},
  abstract  = {Text simplification research has mostly focused on sentence-level simplification, even though many desirable edits - such as adding relevant background information or reordering content - may require document-level context. Prior work has also predominantly framed simplification as a single-step, input-to-output task, only implicitly modeling the fine-grained, span-level edits that elucidate the simplification process. To address both gaps, we introduce the SWiPE dataset, which reconstructs the document-level editing process from English Wikipedia (EW) articles to paired Simple Wikipedia (SEW) articles. In contrast to prior work, SWiPE leverages the entire revision history when pairing pages in order to better identify simplification edits. We work with Wikipedia editors to annotate 5,000 EW-SEW document pairs, labeling more than 40,000 edits with proposed 19 categories. To scale our efforts, we propose several models to automatically label edits, achieving an F-1 score of up to 71.8, indicating that this is a tractable but challenging NLU task. Finally, we categorize the edits produced by several simplification models and find that SWiPE-trained models generate more complex edits while reducing unwanted edits.}
}
@inproceedings{Moon-acl-23,
  author    = {Han Cheol Moon and Shafiq Joty and Ruochen Zhao and Megh Thakkar and Chi Xu},
  title     = {Randomized Smoothing with Masked Inference for Adversarially Robust Text Classification},
  booktitle = {Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics},
  series    = {ACL'23},
  publisher = {ACL},
  address   = {Toronto, Canada},
  year      = {2023},
  url       = {https://arxiv.org/abs/2305.06522},
  abstract  = {Large-scale pre-trained language models have shown outstanding performance in a variety of NLP tasks. However, they are also known to be significantly brittle against specifically crafted adversarial examples, leading to increasing interest in probing the adversarial robustness of NLP systems. We introduce RSMI, a novel two-stage framework that combines randomized smoothing (RS) with masked inference (MI) to improve the adversarial robustness of NLP systems. RS transforms a classifier into a smoothed classifier to obtain robust representations, whereas MI forces a model to exploit the surrounding context of a masked token in an input sequence. RSMI improves adversarial robustness by 2 to 3 times over existing state-of-the-art methods on benchmark datasets. We also perform in-depth qualitative analysis to validate the effectiveness of the different stages of RSMI and probe the impact of its components through extensive ablations. By empirically proving the stability of RSMI, we put it forward as a practical method to robustly train large-scale NLP models. Our code and datasets are available at https://github.com/Han8931/rsmi_nlp.}
}
@inproceedings{Qin-acl-23,
  author    = {Chengwei Qin and Shafiq Joty and Qian Li and Ruochen Zhao},
  title     = {Learning to Initialize: Can Meta Learning Improve Cross-task Generalization in Prompt Tuning?},
  booktitle = {Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics},
  series    = {ACL'23},
  publisher = {ACL},
  address   = {Toronto, Canada},
  year      = {2023},
  url       = {https://arxiv.org/abs/2302.08143},
  abstract  = {Prompt tuning (PT) which only tunes the embeddings of an additional sequence of tokens per task, keeping the pre-trained language model (PLM) frozen, has shown remarkable performance in few-shot learning. Despite this, PT has been shown to rely heavily on good initialization of the prompt embeddings. In this work, we study meta prompt tuning (MPT) to systematically explore how meta-learning can help improve (if it can) cross-task generalization in PT through learning to initialize the prompt embeddings from other relevant tasks. We empirically analyze a representative set of meta learning algorithms in a wide range of adaptation settings with different source/target task configurations on a large set of few-shot tasks. With extensive experiments and analysis, we demonstrate the effectiveness of MPT. We find the improvement to be significant particularly on classification tasks. For other kinds of tasks such as question answering, we observe that while MPT can outperform PT in most cases, it does not always outperform multi-task learning. We further provide an in-depth analysis from the perspective of task similarity.}
}
@inproceedings{linlin-acl-23,
  author    = {Linlin Liu and Xingxuan Li and Megh Thakkar and Xin Li and Shafiq Joty and Luo Si and Lidong Bing},
  title     = {Towards Robust Low-Resource Fine-Tuning with Multi-View Compressed Representations},
  booktitle = {Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics},
  series    = {ACL'23},
  publisher = {ACL},
  address   = {Toronto, Canada},
  year      = {2023},
  url       = {https://arxiv.org/abs/2211.08794},
  abstract  = {Due to the huge amount of parameters, fine-tuning of pretrained language models (PLMs) is prone to overfitting in the low-resource scenarios. In this work, we present a novel method that operates on the hidden representations of a PLM to reduce overfitting. During fine-tuning, our method inserts random autoencoders between the hidden layers of a PLM, which transform activations from the previous layers into a multi-view compressed representation before feeding it into the upper layers. The autoencoders are plugged out after fine-tuning, so our method does not add extra parameters or increase computation cost during inference. Our method demonstrates promising performance improvement across a wide range of sequence- and token-level low-resource NLP tasks. We will make our source code publicly available for research purposes.}
}
@inproceedings{Ding-acl-23,
  author    = {Bosheng Ding and Chengwei Qin and Linlin Liu and Yew Ken Chia and Lidong Bing and Boyang Li and Shafiq Joty},
  title     = {Is GPT-3 a Good Data Annotator?},
  booktitle = {Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics},
  series    = {ACL'23},
  publisher = {ACL},
  address   = {Toronto, Canada},
  year      = {2023},
  url       = {https://arxiv.org/abs/2212.10450},
  abstract  = {Data annotation is the process of labeling data that could be used to train machine learning models. It is a crucial step in the development of NLP systems, as it allows the model to learn the relationship between the input data and the desired output. Generative Pre-trained Transformer 3 (GPT-3), a large-scale language model developed by OpenAI, has demonstrated impressive zero- and few-shot performance on a wide range of NLP tasks. It is therefore natural to wonder whether it can be used to effectively annotate data for NLP tasks. In this paper, we evaluate the performance of GPT-3 as a data annotator by comparing it with traditional data annotation methods and analyzing its output on a range of tasks. Through this analysis, we aim to provide insight into the potential of GPT-3 as a general-purpose data annotator in NLP.}
}
@inproceedings{Long-acl-23,
  author    = {Xuan Long Do and Bowei Zou and Shafiq Joty and Tran Tai and Liangming Pan and Nancy Chen and Ai Ti Aw},
  title     = {Modeling What-to-ask and How-to-ask for Answer-unaware Conversational Question Generation},
  booktitle = {Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics},
  series    = {ACL'23},
  publisher = {ACL},
  address   = {Toronto, Canada},
  year      = {2023},
  url       = {https://arxiv.org/abs/2305.03088},
  abstract  = {Conversational Question Generation (CQG) is a critical task for machines to assist humans in fulfilling their information needs through conversations. The task is generally cast into two different settings: answer-aware and answer-unaware. While the former facilitates the models by exposing the expected answer, the latter is more realistic and receiving growing attentions recently. What-to-ask and how-to-ask are the two main challenges in the answer-unaware setting. To address the first challenge, existing methods mainly select sequential sentences in context as the rationales. We argue that the conversation generated using such naive heuristics may not be natural enough as in reality, the interlocutors often talk about the relevant contents that are not necessarily sequential in context. Additionally, previous methods decide the type of question to be generated (boolean/span-based) implicitly. Modeling the question type explicitly is crucial as the answer, which hints the models to generate a boolean or span-based question, is unavailable. To this end, we present SG-CQG, a two-stage CQG framework. For the what-to-ask stage, a sentence is selected as the rationale from a semantic graph that we construct, and extract the answer span from it. For the how-to-ask stage, a classifier determines the target answer type of the question via two explicit control signals before generating and filtering. In addition, we propose Conv-Distinct, a novel evaluation metric for CQG, to evaluate the diversity of the generated conversation from a context. Compared with the existing answer-unaware CQG models, the proposed SG-CQG achieves state-of-the-art performance.}
}
@inproceedings{Laskar-acl-23,
  author    = {Md Tahmid Rahman Laskar and M Saiful Bari and Mizanur Rahman and Md Amran Hossen Bhuiyan and Shafiq Joty and Jimmy Huang},
  title     = {A Systematic Study of ChatGPT on Benchmark Datasets},
  booktitle = {Findings of the 61st Annual Meeting of the Association for Computational Linguistics},
  series    = {ACL'23 Findings},
  publisher = {ACL},
  address   = {Toronto, Canada},
  year      = {2023},
  url       = {https://arxiv.org/abs/2305.18486},
  abstract  = {While large language models have the potential to benefit society greatly, it is important that we use them responsibly. The recently released ChatGPT language model has drawn a lot of attention, with discussions ongoing whether it achieves impressive performance due to its memorization power for being trained on a massive amount of data, or if it has complex reasoning capability to solve challenging tasks. This paper aims to systematically evaluate these issues. In particular, it focuses on investigating the performance of ChatGPT on academic benchmark datasets, assessing the quality of its generated text, exploring its open-domain knowledge and commonsense reasoning, as well as emerging capability. In addition, we study its potential limitations, such as biases, misinformation generation, and ethical concerns. Our extensive evaluation shows that even though ChatGPT is capable of performing a wide variety of tasks, and may obtain impressive performance in several benchmark datasets, it is still far from achieving the ability to reliably solve many language processing tasks in challenging scenarios.}
}
@inproceedings{Li-acl-23,
  author    = {Qian Li and Shafiq Joty and Daling Wang and Shi Feng and Yifei Zhang and Chengwei Qin},
  title     = {Contrastive Learning with Generated Representations for Inductive Knowledge Graph Embedding},
  booktitle = {Findings of the 61st Annual Meeting of the Association for Computational Linguistics},
  series    = {ACL'23 Findings},
  publisher = {ACL},
  address   = {Toronto, Canada},
  year      = {2023},
  url       = {},
  abstract  = {With the evolution of Knowledge Graphs (KGs), new entities emerge which are not seen before. Representation learning of KGs in such an inductive setting aims to capture and transfer the structural patterns from existing entities to new entities. However, the performance of existing methods in inductive KGs are limited by sparsity and implicit transfer. In this paper, we propose VMCL, a Contrastive Learning (CL) framework with graph guided Variational autoencoder on Meta-KGs in the inductive setting. We first propose representation generation to capture the encoded and generated representations of entities, where the generated variations can densify representations with complementary features. Then, we design two CL objectives that work across entities and meta-KGs to simulate the transfer mode. With extensive experiments we demonstrate that our proposed VMCL can significantly outperform previous state-of-the-art baselines.}
}
@inproceedings{Ravaut-acl-23,
  author    = {Mathieu Ravaut and Shafiq Joty and Nancy Chen},
  title     = {Unsupervised Summarization Re-ranking},
  booktitle = {Findings of the 61st Annual Meeting of the Association for Computational Linguistics},
  series    = {ACL'23 Findings},
  publisher = {ACL},
  address   = {Toronto, Canada},
  year      = {2023},
  url       = {https://arxiv.org/abs/2212.09593},
  abstract  = {With the rise of task-specific pre-training objectives, abstractive summarization models like PEGASUS offer appealing zero-shot performance on downstream summarization tasks. However, the performance of such unsupervised models still lags significantly behind their supervised counterparts. Similarly to the supervised setup, we notice a very high variance in quality among summary candidates from these models whereas only one candidate is kept as the summary output. In this paper, we propose to re-rank summary candidates in an unsupervised manner, aiming to close the performance gap between unsupervised and supervised models. Our approach improves the pre-trained unsupervised PEGASUS by 4.37% to 7.27% relative mean ROUGE across four widely-adopted summarization benchmarks, and achieves relative gains of 7.51% (up to 23.73% from XSum to WikiHow) averaged over 30 transfer setups.}
}
@inproceedings{Gotmare-fse-23,
  author    = {Akhilesh Gotmare and Junnan Li and Shafiq Joty and Steven Hoi},
  title     = {Efficient Text-to-Code Retrieval with Cascaded Fast and Slow Transformer Models},
  booktitle = {ACM Joint European Software Engineering Conference and Symposium on the Foundations of Software Engineering},
  series    = {ESEC/FSE 2023},
  publisher = {ACM},
  address   = {San Francisco, USA},
  year      = {2023},
  url       = {https://arxiv.org/abs/2110.07811},
  abstract  = {The goal of natural language semantic code search is to retrieve a semantically relevant code snippet from a fixed set of candidates using a natural language query. Existing approaches are neither effective nor efficient enough towards a practical semantic code search system. In this paper, we propose an efficient and accurate semantic code search framework with cascaded fast and slow models, in which a fast transformer encoder model is learned to optimize a scalable index for fast retrieval followed by learning a slow classification-based re-ranking model to improve the performance of the top K results from the fast retrieval. To further reduce the high memory cost of deploying two separate models in practice, we propose to jointly train the fast and slow model based on a single transformer encoder with shared parameters. The proposed cascaded approach is not only efficient and scalable, but also achieves state-of-the-art results with an average mean reciprocal ranking (MRR) score of 0.7795 (across 6 programming languages) as opposed to the previous state-of-the-art result of 0.713 MRR on the CodeSearchNet benchmark.}
}
@inproceedings{Weishi-fse-23,
abstract = {Automatic program repair (APR) is crucial to reduce manual debugging efforts for developers and improve software reliability. While conventional search-based techniques typically rely on heuristic rules or a redundancy assumption to mine fix patterns, recent years have witnessed the surge of deep learning (DL) based approaches to automate the program repair process in a data-driven manner. However, their performance is often limited by a fixed set of parameters to model the highly complex search space of APR.
To ease such burden on the parametric models, in this work, we propose a novel Retrieval-Augmented Patch Generation framework (RAP-Gen) by explicitly leveraging relevant fix patterns retrieved from a codebase of previous bug-fix pairs. Specifically, we build a hybrid patch retriever to account for both lexical and semantic matching based on the raw source code in a language-agnostic manner, which does not rely on any code-specific features. In addition, we adapt a code-aware language model CodeT5 as our foundation model to facilitate both patch retrieval and generation tasks in a unified manner. We adopt a stage-wise approach where the patch retriever first retrieves a relevant external bug-fix pair to augment the buggy input for the CodeT5 patch generator, which synthesizes a ranked list of repair patch candidates. Notably, RAP-Gen is a generic APR framework that can flexibly integrate different patch retrievers and generators to repair various types of bugs.
We thoroughly evaluate RAP-Gen on three benchmarks in two programming languages, including the TFix benchmark in JavaScript, and Code Refinement and Defects4J benchmarks in Java, where the bug localization information may or may not be provided. Experimental results show that RAP-Gen significantly outperforms previous state-of-the-art (SoTA) approaches on all benchmarks, e.g., boosting the accuracy of T5-large on TFix from 49.70% to 54.15% (repairing 478 more bugs) and repairing 15 more bugs on 818 Defects4J bugs. Further analysis reveals that our patch retriever can search for relevant fix patterns to guide the APR systems.},
address = {San Francisco, USA},
author = {Weishi Wang and Yue Wang and Shafiq Joty and Steven Hoi},
booktitle = {ACM Joint European Software Engineering Conference and Symposium on the Foundations of Software Engineering},
publisher = {ACM},
series = {ESEC/FSE 2023},
title = {RAP-Gen: Retrieval-Augmented Patch Generation with CodeT5 for Automatic Program Repair},
url = {https://arxiv.org/abs/2309.06057},
year = {2023}
}
@inproceedings{XGen,
  author   = {Erik Nijkamp* and Tian Xie* and Hiroaki Hayashi* and Bo Pang* and Congying Xia* and Chen Xing and Jesse Vig and Semih Yavuz and Philippe Laban and Ben Krause and Senthil Purushwalkam and Tong Niu and Wojciech Kryscinski and Lidiya Murakhovs'ka and Prafulla Choubey and Alex Fabbri and Ye Liu and Rui Meng and Lifu Tu and Meghana Bhat and Chien-Sheng Wu and Silvio Savarese and Yingbo Zhou and Shafiq Joty+ and Caiming Xiong+},
  title    = {Long Sequence Modeling with XGen: A 7B LLM Trained on 8K Input Sequence Length},
  series   = {SAI Blog},
  year     = {2023},
  url      = {https://arxiv.org/abs/2309.03450},
  abstract = {Large Language Models (LLMs) have become ubiquitous across various domains, transforming the way we interact with information and conduct research. However, most high-performing LLMs remain confined behind proprietary walls, hindering scientific progress. Most open-source LLMs, on the other hand, are limited in their ability to support longer sequence lengths, which is a key requirement for many tasks that require inference over an input context. To address this, we have trained XGen, a series of 7B parameter models on up to 8K sequence length for up to 1.5T tokens. We have also finetuned the XGen models on public-domain instructional data, creating their instruction-tuned counterparts (XGen-Inst). We open-source our models for both research advancements and commercial applications. Our evaluation on standard benchmarks shows that XGen models achieve comparable or better results when compared with state-of-the-art open-source LLMs. Our targeted evaluation on long sequence modeling tasks shows the benefits of our 8K-sequence models over 2K-sequence open-source LLMs.},
}
@inproceedings{joty-etal-2023-nlp,
  author    = {Shafiq Joty and Enamul Hoque and Jesse Vig},
  title     = {{NLP}+{V}is: {NLP} Meets Visualization},
  booktitle = {Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing: Tutorial Abstracts},
  series    = {EMNLP'23 Tutorial},
  year      = {2023},
  address   = {Singapore},
  publisher = {Association for Computational Linguistics},
  pages     = {1--6},
  url       = {https://aclanthology.org/2023.emnlp-tutorial.1},
  abstract  = {Natural language and visualization (Vis) are two powerful modalities of human communication. The goal of this tutorial is to push forward the agenda of tightly integrating these two modalities. To this end, the tutorial will introduce NLP+Vis with a focus on two main threads of work: \textit{(i) NLP for Vis:} How to develop and adapt state-of-the-art NLP models for solving various visualization tasks? and \textit{(ii) Vis for NLP:} How to leverage visualization techniques to interpret and explain complex NLP models effectively? The tutorial will first motivate why NLP+Vis is an important area of research and provide an overview of research topics on combining NLP and Vis techniques. Then an overview of state-of-the-art deep learning models for NLP will be covered. Next, we will provide an overview of applying visualization techniques to help make NLP models more interpretable and explainable. In the final part, we will focus on various application tasks at the intersection of NLP and Vis. We will conclude with an interactive discussion of future challenges for NLP+Vis applications. The audience will include researchers interested in applying NLP for visualizations as well as others who focus more generally at the intersection of machine learning and visualization.},
}
@inproceedings{Bojic-et-al-ML4H-22,
abstract = {Question Answering (QA) systems can support health coaches in facilitating clients' lifestyle behavior changes (e.g., in adopting healthy sleep habits). In this paper, we formulate a domain-specific QA task for sleep coaching. To this end, we release SleepQA, a dataset created from 7,005 passages comprising 4,250 training examples with single annotations and 750 examples with 5-way annotations. We train a bi-encoder retrieval system on our dataset and perform extensive automated and human evaluations of the resulting end-to-end QA system. Comparisons of our model with various baselines shows improvements for domain-specific natural language processing on real-world questions. We hope that this dataset will lead to wider research interest in this important health domain.},
address = {New Orleans, USA},
author = {Iva Bojic and Qi Ong and Megh Thakkar and Esha Kamran and Irving Shua and Rei Pang and Jessica Chen and Vaaruni Nayak and Shafiq Joty and Josip Car},
booktitle = {2022 Machine Learning for Health (Proceedings Track)},
numpages = {9},
publisher = {Proceedings of Machine Learning Research (PMLR)},
series = {ML4H@NeurIPS'22},
title = {SleepQA: A Health Coaching Dataset on Sleep for Extractive Question Answering},
url = {https://openreview.net/pdf?id=YJbrADZZ8l8},
year = {2022}
}
@inproceedings{Ravaut-et-al-emnlp-22,
abstract = {Sequence-to-sequence deep neural models fine-tuned for abstractive summarization can achieve great performance on datasets with enough human annotations. Yet, it has been shown that they have not reached their full potential, with a wide gap between the top beam search output and the \emph{oracle} beam. Recently, re-ranking methods have been proposed, to learn to select a better summary candidate. However, such methods are limited by the summary quality aspects captured by the first-stage candidates. To bypass this limitation, we propose a new paradigm in second-stage abstractive summarization called SummaFusion that fuses several summary candidates to produce a novel abstractive \emph{second-stage} summary. Our method works well on several summarization datasets, improving both the ROUGE scores and qualitative properties of fused summaries. It is especially good when the candidates to fuse are worse, such as in the few-shot setup where we set a new state-of-the-art.},
address = {Abu Dhabi, UAE},
author = {Mathieu Ravaut and Shafiq Joty and Nancy Chen},
booktitle = {the 2022 Conference on Empirical Methods in Natural Language Processing},
publisher = {ACL},
series = {EMNLP'22},
title = {Towards Summary Candidates Fusion},
url = {https://arxiv.org/abs/2210.08779},
year = {2022}
}
@inproceedings{Chen-et-al-emnlp-22,
abstract = {Machine learning models usually assume i.i.d data during training and testing, but data and tasks in real world often change over time. To emulate the transient nature of real world, we propose a challenging but practical task: text classification \textit{in-the-wild}, which introduces different non-stationary training/testing stages. Decomposing a complex task into modular components can enable robust generalisation under such non-stationary environment. However, current modular approaches in NLP do not take advantage of recent advances in parameter efficient tuning of pretrained language models. To close this gap, we propose \textsc{\small{ModularPrompt}}, a label-modular prompt tuning framework for text classification tasks. In \textsc{\small{ModularPrompt}}, the input prompt consists of a sequence of soft \emph{label} prompts, each encoding modular knowledge related to the corresponding class label. In two of most formidable settings, \textsc{\small{ModularPrompt}} outperforms relevant baselines by a large margin demonstrating strong generalisation ability. We also conduct comprehensive analysis to validate whether the learned prompts satisfy properties of a modular representation.},
address = {Abu Dhabi, UAE},
author = {Hailin Chen and Amrita Saha and Shafiq Joty and Steven Hoi},
booktitle = {the 2022 Conference on Empirical Methods in Natural Language Processing},
publisher = {ACL},
series = {EMNLP'22},
title = {Learning Label Modular Prompts for Text Classification in the Wild},
url = {https://arxiv.org/abs/2211.17142},
year = {2022}
}
@inproceedings{Kantharaj-et-al-emnlp-22,
abstract = {Charts are very popular to analyze data and convey important insights. People often analyze visualizations to answer open-ended questions that require explanatory answers. Answering such questions are often difficult and time-consuming as it requires a lot of cognitive and perceptual efforts. To address this challenge, we introduce a new task called OpenCQA, where the goal is to answer an open-ended question about a chart with descriptive texts. We present the annotation process and an in-depth analysis of our dataset. We implement and evaluate a set of baselines under three practical settings. In the first setting, a chart and the accompanying article is provided as input to the model. The second setting provides only the relevant paragraph(s) to the chart instead of the entire article, whereas the third setting requires the model to generate an answer solely based on the chart. Our analysis of the results show that the top performing models generally produce fluent and coherent text while they struggle to perform complex logical and arithmetic reasoning.},
address = {Abu Dhabi, UAE},
author = {Shankar Kantharaj and Xuan Long Do and Rixie Tiffany Leong and Jia Qing Tan and Enamul Hoque and Shafiq Joty},
booktitle = {the 2022 Conference on Empirical Methods in Natural Language Processing},
publisher = {ACL},
series = {EMNLP'22},
title = {OpenCQA: Open-ended Question Answering with Charts},
url = {https://arxiv.org/abs/2210.06628},
year = {2022}
}
@inproceedings{Liu-et-al-emnlp-22,
  author    = {Linlin Liu and Xin Li and Ruidan He and Lidong Bing and Shafiq Joty and Luo Si},
  title     = {Enhancing Multilingual Language Model with Massive Multilingual Knowledge Triples},
  booktitle = {the 2022 Conference on Empirical Methods in Natural Language Processing},
  series    = {EMNLP'22},
  year      = {2022},
  address   = {Abu Dhabi, UAE},
  publisher = {ACL},
  url       = {https://arxiv.org/abs/2111.10962},
  abstract  = {Knowledge-enhanced language representation learning has shown promising results across various knowledge-intensive NLP tasks. However, prior methods are limited in efficient utilization of multilingual knowledge graph (KG) data for language model (LM) pretraining. They often train LMs with KGs in indirect ways, relying on extra entity/relation embeddings to facilitate knowledge injection. In this work, we explore methods to make better use of the multilingual annotation and language agnostic property of KG triples, and present novel knowledge based multilingual language models (KMLMs) trained directly on the knowledge triples. We first generate a large amount of multilingual synthetic sentences using the Wikidata KG triples. Then based on the intra- and inter-sentence structures of the generated data, we design pretraining tasks to enable the LMs to not only memorize the factual knowledge but also learn useful logical patterns. Our pretrained KMLMs demonstrate significant performance improvements on a wide range of knowledge-intensive cross-lingual tasks, including named entity recognition (NER), factual knowledge retrieval, relation classification, and a newly designed logical reasoning task. Our code and pretrained models will be made publicly available.},
}
@inproceedings{Li-et-al-emnlp-22,
  author    = {Qian Li and Shafiq Joty and Daling Wang and Shi Feng and Yifei Zhang},
  title     = {Alleviating Sparsity of Open Knowledge Graphs with Ternary Contrastive Learning},
  booktitle = {the 2022 Conference on Empirical Methods in Natural Language Processing (Findings)},
  series    = {EMNLP'22},
  year      = {2022},
  address   = {Abu Dhabi, UAE},
  publisher = {ACL},
  url       = {https://arxiv.org/abs/2211.03950},
  abstract  = {Sparsity of formal knowledge and roughness of non-ontological construction methods make sparsity problem particularly prominent in Open Knowledge Graphs (OpenKGs). Due to sparse links, learning effective representation for few-shot entities becomes difficult. We hypothesize that by introducing negative samples, a contrastive learning (CL) formulation could be beneficial in such scenarios. However, existing CL methods consider binary objects while modeling KG triplets and they are too generic, i.e., they ignore zero-shot, few-shot and synonymity problems that appear in OpenKGs. To address this, we propose TernaryCL, a CL framework based on ternary propagation patterns among head, relation and tail. TernaryCL designs \emph{Contrastive Entity} and \emph{Contrastive Relation} to mine ternary discriminative features by considering both negative entities and relations. It also introduces \emph{Contrastive Self} to help zero- and few-shot entities learn discriminative features, \emph{Contrastive Synonym} to consider synonymous entities, and \emph{Contrastive Fusion} to aggregate graph features from multiple paths. With extensive experiments on benchmark datasets, we demonstrate the superiority of TernaryCL over state-of-the-art models.},
}
@inproceedings{Mohiuddin-et-al-emnlp-22,
abstract = {Neural Machine Translation (NMT) models are typically trained on heterogeneous data that are concatenated and randomly shuffled. However, not all of the training data are equally useful to the model. Curriculum training aims to present the data to the NMT models in a meaningful order. In this work, we introduce a two-stage curriculum training framework for NMT where we fine-tune a base NMT model on subsets of data, selected by both deterministic scoring using pre-trained methods and online scoring that considers prediction scores of the emerging NMT model. Through comprehensive experiments on six language pairs comprising low- and high-resource languages from WMT'21, we have shown that our curriculum strategies consistently demonstrate better quality (up to +2.2 BLEU improvement) and faster convergence (approximately 50% fewer updates).},
address = {Abu Dhabi, UAE},
author = {Tasnim Mohiuddin and Philipp Koehn and Vishrav Chaudhary and James Cross and Shruti Bhosale and Shafiq Joty},
booktitle = {the 2022 Conference on Empirical Methods in Natural Language Processing (Findings)},
publisher = {ACL},
series = {EMNLP'22},
title = {Data Selection Curriculum for Neural Machine Translation},
url = {https://arxiv.org/abs/2203.13867},
year = {2022}
}
@inproceedings{Wang-et-al-emnlp-22,
abstract = {We present BotSIM, a data-efficient end-to-end Bot SIMulation framework for commercial task-oriented dialog (TOD) systems. BotSIM consists of three major components: 1) a Generator that can infer semantic-level dialog acts and entities from bot definitions and generate conversations via model-based paraphrasing; 2) an agenda-based dialog user Simulator to communicate with the dialog agents; 3) a Remediator to analyze and visualize the bot health reports and provide actionable remediation suggestions for troubleshooting and improving the dialog system.
We demonstrate BotSIM's effectiveness in end-to-end evaluation, remediation and multi-intent dialog generation via case studies on two commercial bot platforms. BotSIM's "generation-simulation-remediation" paradigm accelerates the end-to-end bot evaluation and iteration process by: 1) reducing the effort needed to create test cases; 2) enabling a better understanding of both NLU and end-to-end performance via extensive dialog simulation; 3) improving the bot troubleshooting process with actionable suggestions from simulation results analysis. A demo of our system can be found at https://tinyurl.com/mryu74cd and a demo video at https://youtu.be/qLPJm6_UOKY.},
address = {Abu Dhabi, UAE},
author = {Guangsen Wang and Samson Tan and Shafiq Joty and Gang Wu and Jimmy Au and Steven Hoi},
booktitle = {the 2022 Conference on Empirical Methods in Natural Language Processing (demo)},
publisher = {ACL},
series = {EMNLP'22},
title = {BotSIM: An End-to-End Bot Simulation Framework for Commercial Task-Oriented Dialog Systems},
url = {https://www.youtube.com/watch?v=qLi5iSoly30},
year = {2022}
}
@inproceedings{nguyen2022umt,
abstract = {Numerous recent work on unsupervised machine translation (UMT) implies that competent unsupervised translations of low-resource and unrelated languages, such as Nepali or Sinhala, are only possible if the model is trained in a massive multilingual environment, where these low-resource languages are mixed with high-resource counterparts. Nonetheless, while the high-resource languages greatly help kick-start the target low-resource translation tasks, the language discrepancy between them may hinder their further improvement. In this work, we propose a simple refinement procedure to disentangle languages from a pre-trained multilingual UMT model for it to focus on only the target low-resource task. Our method achieves the state of the art in the fully unsupervised translation tasks of English to Nepali, Sinhala, Gujarati, Latvian, Estonian and Kazakh, with BLEU score gains of 3.5, 3.5, 3.3, 4.1, 4.2, and 3.3, respectively. Our codebase is available at anonymous.4open.science/r/fairseq-py-BB44.},
address = {New Orleans, USA},
author = {Xuan-Phi Nguyen and Shafiq Joty and Wu Kui and Ai Ti Aw},
booktitle = {2022 Conference on Neural Information Processing Systems},
numpages = {9},
series = {NeurIPS'22},
title = {Refining Low-Resource Unsupervised Translation by Language Disentanglement of Multilingual Translation Model},
url = {https://arxiv.org/abs/2205.15544},
year = {2022}
}
@inproceedings{Moon-KDD-22,
abstract = {We present GradMask, a simple adversarial example detection scheme for natural language processing (NLP) models. It uses gradient signals to detect adversarially perturbed tokens in an input sequence and occludes such tokens by a masking process. GradMask provides several advantages over existing methods including improved detection performance and an interpretation of its decision with a only moderate computational cost. Its approximated inference cost is no more than a single forward- and back-propagation through the target model without requiring any additional detection module. Extensive evaluation on widely adopted NLP benchmark datasets demonstrate the efficiency and effectiveness of GradMask.},
address = {Washington DC, USA},
author = {Han-Cheol Moon and Shafiq Joty and Xu Chi},
booktitle = {28th ACM SIGKDD Conference on Knowledge Discovery and Data Mining},
doi = {10.1145/3534678.3539206},
publisher = {ACM},
series = {SIGKDD'22},
title = {GradMask: Gradient-Guided Token Masking for Textual Adversarial Example Detection},
url = {https://dl.acm.org/doi/abs/10.1145/3534678.3539206},
year = {2022}
}
@inproceedings{Jwala-acl-22,
abstract = {Given the claims of improved text generation quality across various pre-trained neural models, we consider the coherence evaluation of machine generated text to be one of the principal applications of coherence models that needs to be investigated. Prior work in neural coherence modeling has primarily focused on devising new architectures for solving the permuted document task. We instead use a basic model architecture and show significant improvements over state of the art within the same training regime. We then design a harder self-supervision objective by increasing the ratio of negative samples within a contrastive learning setup, and enhance the model further through automatic hard negative mining coupled with a large global negative queue encoded by a momentum encoder. We show empirically that increasing the density of negative samples improves the basic model, and using a global negative queue further improves and stabilizes the model while training with hard negative samples. We evaluate the coherence model on task-independent test sets that resemble real-world applications and show significant improvements in coherence evaluations of downstream tasks.},
address = {Dublin, Ireland},
author = {Prathyusha Jwalapuram and Shafiq Joty and Xiang Lin},
booktitle = {Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics},
publisher = {ACL},
series = {ACL'22},
title = {Rethinking Self-Supervision Objectives for Generalizable Coherence Modeling},
url = {https://arxiv.org/abs/2110.07198},
year = {2022}
}
@inproceedings{Chengwei-acl-22,
abstract = {Existing continual relation learning (CRL) methods rely on plenty of labeled training data for learning a new task, which can be hard to acquire in real scenario as getting large and representative labeled data is often expensive and time-consuming. It is therefore necessary for the model to learn novel relational patterns with very few labeled data while avoiding catastrophic forgetting of previous task knowledge. In this paper, we formulate this challenging yet practical problem as continual few-shot relation learning (CFRL). Based on the finding that learning for new emerging few-shot tasks often results in feature distributions that are incompatible with previous tasks' learned distributions, we propose a novel method based on embedding space regularization and data augmentation. Our method generalizes to new few-shot tasks and avoids catastrophic forgetting of previous tasks by enforcing extra constraints on the relational embeddings and by adding extra relevant data in a self-supervised manner. With extensive experiments we demonstrate that our method can significantly outperform previous state-of-the-art methods in CFRL task settings.},
address = {Dublin, Ireland},
author = {Chengwei Qin and Shafiq Joty},
booktitle = {Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics},
publisher = {ACL},
series = {ACL'22},
title = {Continual Few-shot Relation Learning via Embedding Space Regularization and Data Augmentation},
url = {https://arxiv.org/abs/2203.02135},
year = {2022}
}
@inproceedings{Ravaut-acl-22,
abstract = {Sequence-to-sequence neural networks have recently achieved great success in abstractive summarization, especially with the trend of fine-tuning large pre-trained language models on the downstream dataset. These models are typically decoded with beam search to generate a unique summary. However, the search space is very large, and due to exposure bias, such decoding is not optimal. In this paper, we show that it is possible to directly train a second-stage model performing re-ranking on a set of summary candidates. Our mixture-of-experts SummaReranker learns to select a better candidate and systematically improves the performance of the base model. With a base PEGASUS, we push ROUGE scores by 5.44% on CNN-DailyMail (47.16 ROUGE-1), 1.31% on XSum (48.12 ROUGE-1) and 9.34% on Reddit TIFU (29.83 ROUGE-1), reaching a new state-of-the-art.},
address = {Dublin, Ireland},
author = {Mathieu Ravaut and Shafiq Joty and Nancy Chen},
booktitle = {Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics},
publisher = {ACL},
series = {ACL'22},
title = {SummaReranker: A Multi-Task Mixture-of-Experts Re-Ranking Framework for Abstractive Summarization},
url = {https://arxiv.org/abs/2203.06569},
year = {2022}
}
@inproceedings{Shankar-acl-22,
abstract = {Charts are commonly used for exploring data and communicating insights. Generating natural language summaries from charts can be very helpful for people in inferring key insights that would otherwise require a lot of cognitive and perceptual efforts. We present Chart-to-text, a large-scale benchmark with two datasets and a total of 44,096 charts covering a wide range of topics and chart types. We explain the dataset construction process and analyze the datasets. We also introduce a number of state-of-the-art neural models as baselines that utilize image captioning and data-to-text generation techniques to tackle two problem variations: one assumes the underlying data table of the chart is available while the other needs to extract data from chart images. Our analysis with automatic and human evaluation shows that while our best models usually generate fluent summaries and yield reasonable BLEU scores, they also suffer from hallucinations and factual errors as well as difficulties in correctly explaining complex patterns and trends in charts.},
address = {Dublin, Ireland},
author = {Shankar Kantharaj and Rixie Leong and Xiang Lin and Ahmed Masry and Megh Thakkar and Enamul Hoque and Shafiq Joty},
booktitle = {Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics},
publisher = {ACL},
series = {ACL'22},
title = {Chart-to-Text: A Large-Scale Benchmark for Chart Summarization},
url = {https://arxiv.org/abs/2203.06486},
year = {2022}
}
@inproceedings{Bosheng-acl-22,
abstract = {Over the last few years, there has been a move towards data curation for multilingual task-oriented dialogue (ToD) systems that can serve people speaking different languages. However, existing multilingual ToD datasets either have a limited coverage of languages due to the high cost of data curation, or ignore the fact that dialogue entities barely exist in countries speaking these languages. To tackle these limitations, we introduce a novel data curation method that generates GlobalWoZ --- a large-scale multilingual ToD dataset globalized from an English ToD dataset for three unexplored use cases of multilingual ToD systems. Our method is based on translating dialogue templates and filling them with local entities in the target-language countries. Besides, we extend the coverage of target languages to 20 languages. We will release our dataset and a set of strong baselines to encourage research on multilingual ToD systems for real use cases.},
address = {Dublin, Ireland},
author = {Bosheng Ding and Junjie Hu and Lidong Bing and Mahani Aljunied and Shafiq Joty and Luo Si and Chunyan Miao},
booktitle = {Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics},
publisher = {ACL},
series = {ACL'22},
title = {GlobalWoZ: Globalizing MultiWoZ to Develop Multilingual Task-Oriented Dialogue Systems},
url = {https://arxiv.org/abs/2110.07679},
year = {2022}
}
@inproceedings{Ahmed-acl-22,
abstract = {Charts are very popular for analyzing data. When exploring charts, people often ask a variety of complex reasoning questions that involve several logical and arithmetic operations. They also commonly refer to visual features of a chart in their questions. However, most existing datasets do not focus on such complex reasoning questions as their questions are template-based and answers come from a fixed-vocabulary. In this work, we present a large-scale benchmark covering 9.6K human-written questions as well as 23.1K questions generated from human-written chart summaries. To address the unique challenges in our benchmark involving visual and logical reasoning over charts, we present two transformer-based models that combine visual features and the data table of the chart in a unified way to answer questions. While our models achieve the state-of-the-art results on the previous datasets as well as on our benchmark, the evaluation also reveals several challenges in answering complex reasoning questions.},
address = {Dublin, Ireland},
author = {Ahmed Masry and Do Xuan Long and Jia Qing Tan and Shafiq Joty and Enamul Hoque},
booktitle = {Findings of the 60th Annual Meeting of the Association for Computational Linguistics},
publisher = {ACL},
series = {ACL'22 Findings},
title = {ChartQA: A Benchmark for Question Answering about Charts with Visual and Logical Reasoning},
url = {https://arxiv.org/abs/2203.10244},
year = {2022}
}
@inproceedings{Chengwei-et-al-arxiv-22,
abstract = {Existing approaches to lifelong language learning rely on plenty of labeled data for learning a new task, which is hard to obtain in most real scenarios. Considering that humans can continually learn new tasks from a handful of examples, we expect the models also to be able to generalize well on new few-shot tasks without forgetting the previous ones. In this work, we define this more challenging yet practical problem as Lifelong Few-shot Language Learning (LFLL) and propose a unified framework for it based on prompt tuning of T5. Our framework called LFPT5 takes full advantage of PT's strong few-shot learning ability, and simultaneously trains the model as a task solver and a data generator. Before learning a new domain of the same task type, LFPT5 generates pseudo (labeled) samples of previously learned domains, and later gets trained on those samples to alleviate forgetting of previous knowledge as it learns the new domain. In addition, a KL divergence loss is minimized to achieve label consistency between the previous and the current model. While adapting to a new task type, LFPT5 includes and tunes additional prompt embeddings for the new task. With extensive experiments, we demonstrate that LFPT5 can be applied to various different types of tasks and significantly outperform previous methods in different LFLL settings.},
author = {Chengwei Qin and Shafiq Joty},
booktitle = {International Conference on Learning Representations},
series = {ICLR-22},
title = {{LFPT5}: A Unified Framework for Lifelong Few-shot Language Learning Based on Prompt Tuning of {T5}},
url = {https://openreview.net/forum?id=HCRVf71PMF},
year = {2022}
}
@inproceedings{Phi-et-al-ICLR-22,
abstract = {Modern unsupervised machine translation systems mostly train their models by generating synthetic parallel training data from large unlabeled monolingual corpora of different languages through various means, such as iterative backtranslation. However, there may exist small amount of actual parallel data hidden in the sea of unlabeled data, which has not been exploited. We develop a new fine-tuning objective, called Language-Agnostic Constraint for SwAV loss, or LAgSwAV, which enables a pre-trained model to extract such pseudo-parallel data from the monolingual corpora in a fully unsupervised manner. We then propose an effective strategy to utilize the obtained synthetic data to augment unsupervised machine translation. Our method achieves the state of the art in the WMT'14 English-French, WMT'16 German-English and English-Romanian bilingual unsupervised translation tasks, with 40.2, 36.8, and 37.0 BLEU, respectively. We also achieve substantial improvements in the FLoRes low-resource English-Nepali and English-Sinhala unsupervised tasks with 5.3 and 5.4 BLEU, respectively.},
author = {Xuan-Phi Nguyen and Hongyu Gong and Yun Tang and Changhan Wang and Philipp Koehn and Shafiq Joty},
booktitle = {International Conference on Learning Representations},
series = {ICLR-22},
title = {{Contrastive Clustering to Mine Pseudo Parallel Data for Unsupervised Translation}},
url = {https://openreview.net/pdf?id=pN1JOdrSY9},
year = {2022}
}
@inproceedings{Saha-aaai-2022,
abstract = {Neural Module Networks (NMNs) have been quite successful in incorporating explicit reasoning as learnable modules in various question answering tasks, including the most generic form of numerical reasoning over text in Machine Reading Comprehension (MRC). However, to achieve this, contemporary NMNs need strong supervision in executing the query as a specialized program over reasoning modules and fail to generalize to more open-ended settings without such supervision. Hence we propose Weakly-Supervised Neuro-Symbolic Module Network (WNSMN) trained with answers as the sole supervision for numerical reasoning based MRC. It learns to execute a noisy heuristic program obtained from the dependency parsing of the query, as discrete actions over both neural and symbolic reasoning modules and trains it end-to-end in a reinforcement learning framework with discrete reward from answer matching. On the numerical-answer subset of DROP, WNSMN outperforms NMN by 32\% and the reasoning-free language model GenBERT by 8\% in exact match accuracy when trained under comparable weak supervised settings. This showcases the effectiveness and generalizability of modular networks that can handle explicit discrete reasoning over noisy programs in an end-to-end manner.},
address = {Vancouver, Canada},
author = {Amrita Saha and Shafiq Joty and Steven Hoi},
booktitle = {Thirty-Sixth AAAI Conference on Artificial Intelligence},
series = {AAAI'22},
title = {{Weakly Supervised Neuro-Symbolic Module Networks for Numerical Reasoning}},
url = {https://arxiv.org/pdf/2101.11802.pdf},
year = {2022}
}
@inproceedings{Long-coling-22,
abstract = {Conversational question generation (CQG) serves as a vital task for machines to assist humans, such as interactive reading comprehension, through conversations. Compared to traditional single-turn question generation (SQG), CQG is more challenging in the sense that the generated question is required not only to be meaningful, but also to align with the provided conversation. Previous studies mainly focus on how to model the flow and alignment of the conversation, but do not thoroughly study which parts of the context and history are necessary for the model. We believe that shortening the context and history is crucial as it can help the model to optimise more on the conversational alignment property. To this end, we propose CoHS-CQG, a two-stage CQG framework, which adopts a novel CoHS module to shorten the context and history of the input. In particular, it selects the top-p sentences and history turns by calculating the relevance scores of them. Our model achieves state-of-the-art performances on CoQA in both the answer-aware and answer-unaware settings.},
address = {Gyeongju, Republic of Korea},
author = {Xuan Long Do and Bowei Zou and Liangming Pan and Nancy Chen and Shafiq Joty and Ai Ti Aw},
booktitle = {Proceedings of the 29th International Conference on Computational Linguistics},
month = {October},
series = {COLING'22},
title = {{CoHS-CQG}: Context and History Selection for Conversational Question Generation},
url = {https://aclanthology.org/2022.coling-1.48.pdf},
year = {2022}
}
@inproceedings{linlin-coling-22,
abstract = {Cross-lingual word embeddings (CLWE) have been proven useful in many cross-lingual tasks. However, most existing approaches to learn CLWE including the ones with contextual embeddings are sense agnostic. In this work, we propose a novel framework to align contextual embeddings at the sense level by leveraging cross-lingual signal from bilingual dictionaries only. We operationalize our framework by first proposing a novel sense-aware cross entropy loss to model word senses explicitly. The monolingual ELMo and BERT models pretrained with our sense-aware cross entropy loss demonstrate significant performance improvement for word sense disambiguation tasks. We then propose a sense alignment objective on top of the sense-aware cross entropy loss for cross-lingual model pretraining, and pretrain cross-lingual models for several language pairs (English to German/Spanish/Japanese/Chinese). Compared with the best baseline results, our cross-lingual models achieve 0.52\%, 2.09\% and 1.29\% average performance improvements on zero-shot cross-lingual NER, sentiment classification and XNLI tasks, respectively. We will release our code.},
address = {Gyeongju, Republic of Korea},
author = {Linlin Liu and Thien Hai Nguyen and Shafiq Joty and Lidong Bing and Luo Si},
booktitle = {Proceedings of the 29th International Conference on Computational Linguistics},
month = {October},
series = {COLING'22},
title = {Towards Multi-Sense Cross-Lingual Alignment of Contextual Embeddings},
url = {https://aclanthology.org/2022.coling-1.386/},
year = {2022}
}
@inproceedings{Gao-aaai-2022,
abstract = {Research in image captioning has mostly focused on English because of the availability of image-caption paired datasets in this language. However, building vision-language systems only for English deprives a large part of the world population of AI technologies' benefit. On the other hand, creating image-caption paired datasets for every target language is expensive. In this work, we present a novel unsupervised cross-lingual method to generate image captions in a target language without using any image-caption corpus in the source or target languages. Our method relies on (i) a cross-lingual scene graph to sentence translation process, which learns to decode sentences in the target language from a cross-lingual encoding space of scene graphs using a sentence parallel (bitext) corpus, and (ii) an unsupervised cross-modal feature mapping which seeks to map an encoded scene graph features from image modality to language modality. We verify the effectiveness of our proposed method on the Chinese image caption generation task. The comparisons against several existing methods demonstrate the effectiveness of our approach.},
address = {Vancouver, Canada},
author = {Jiahui Gao and Yi Zhou and Philip Yu and Shafiq Joty and Jiuxiang Gu},
booktitle = {Thirty-Sixth AAAI Conference on Artificial Intelligence},
series = {AAAI'22},
title = {{Unsupervised Cross-lingual Image Captioning}},
url = {https://arxiv.org/abs/2010.01288},
year = {2022}
}
@inproceedings{chen-et-al-sigmod-22-demo,
abstract = {The database systems course in an undergraduate computer science degree program is gaining increasing importance due to the continuous supply of database-related jobs as well as the rise of Data Science. A key learning goal of learners taking such a course is to understand how sql queries are executed in an rdbms in practice. Existing rdbms typically expose a query execution plan (qep) in visual or textual format, which describes the execution steps for a given query. However, it is often daunting for a learner to comprehend these qeps containing vendor-specific implementation details. In this demonstration, we present a novel, generic, and portable system called lantern that generates a natural language-based description of the execution strategy chosen by the underlying rdbms to process a query. It provides a declarative framework called pool for subject matter experts (sme) to efficiently create and manipulate natural language descriptions of physical operators of any rdbms. It then exploits pool to generate nl description of a qep by integrating rule-based and deep learning-based techniques to infuse language variability in the descriptions. Such nl generation strategy mitigates the impact of boredom on learners caused by repeated exposure of similar text generated by rule-based systems.},
address = {Philadelphia, PA, USA},
author = {Peng Chen and Hui Li and Sourav Bhowmick and Shafiq Joty and Weiguo Wang},
booktitle = {Proceedings of 2022 ACM SIGMOD International Conference on Management of Data (Demo)},
month = {June},
publisher = {ACM},
series = {SIGMOD'22 (Demo)},
title = {{LANTERN}: Boredom-conscious Natural Language Description Generation of Query Execution Plans for Database Education},
url = {papers/chen-et-al-sigmod-22-demo.pdf},
year = {2022}
}
@article{li-et-al-piano-ieee-2022,
abstract = {Since its introduction in 2003, the influence maximization (IM) problem has drawn significant research attention in the literature. The aim of IM, which is NP-hard, is to select a set of k users known as seed users who can influence the most individuals in the social network. The state-of-the-art algorithms estimate the expected influence of nodes based on sampled diffusion paths. As the number of required samples have been recently proven to be lower bounded by a particular threshold that presets tradeoff between the accuracy and efficiency, the result quality of these traditional solutions is hard to be further improved without sacrificing efficiency. In this paper, we present an orthogonal and novel paradigm to address the IM problem by leveraging deep reinforcement learning to estimate the expected influence. Specifically, we present a novel framework called PIANO that incorporates network embedding and reinforcement learning techniques to address this problem. In order to make it practical, we further present PIANO-E and PIANO@⟨d⟩, both of which can be applied directly to answer IM without training the model from scratch. Experimental study on real-world networks demonstrates that PIANO achieves the best performance w.r.t efficiency and influence spread quality compared to state-of-the-art classical solutions. We also demonstrate that the learned parametric models generalize well across different networks. Besides, we provide a pool of pretrained PIANO models such that any IM task can be addressed by directly applying a model from the pool without training over the targeted network.},
author = {Hui Li and Mengting Xu and Sourav Bhowmick and Shafiq Joty and Changsheng Sun and Jiangtao Cui},
journal = {IEEE Transactions on Computational Social Systems (IEEE TCSS)},
publisher = {IEEE CS Press},
title = {{PIANO}: Influence Maximization Meets Deep Reinforcement Learning},
url = {papers/li-et-al-piano-ieee-2022.pdf},
year = {2022}
}
@inproceedings{joty-etal-2022-vis,
abstract = {This tutorial will provide an introduction to natural language processing (NLP) to interested researchers in the visualization (Vis) community. It will first motivate why NLP4Vis is an important area of research and provide an overview of research topics on combining NLP and Vis techniques. Then an overview of deep learning models for NLP will be covered. A particular focus will be provided on highlighting the recent progress on large language models such as ChatGPT and how such models can be leveraged to solve various NLP tasks for visualizations. In the final part, we will focus on various application tasks at the intersection of NLP and Vis. We will conclude with an interactive discussion of future challenges for NLP+Vis applications. The audience will include researchers interested in applying NLP for visualizations as well as others who focus more generally at the intersection of AI and visualization.},
author = {Shafiq Joty and Enamul Hoque},
booktitle = {Proceedings of the 2022 IEEE Vis Conference},
publisher = {IEEE},
series = {IEEE Vis'22},
title = {{NLP4Vis}: Natural Language Processing for Information Visualization},
url = {https://virtual.ieeevis.org/year/2022/session_t-nlp4vis-2.html},
year = {2022}
}
@inproceedings{Junnan-et-al-nips-21,
abstract = {Large-scale vision and language representation learning has shown promising improvements on various vision-language tasks. Most existing methods employ a transformer-based multimodal encoder to jointly model visual tokens (region-based image features) and word tokens. Because the visual tokens and word tokens are unaligned, it is challenging for the multimodal encoder to learn image-text interactions. In this paper, we introduce a contrastive loss to ALign the image and text representations BEfore Fusing (ALBEF) them through cross-modal attention, which enables more grounded vision and language representation learning. Unlike most existing methods, our method does not require bounding box annotations nor high-resolution images. In order to improve learning from noisy web data, we propose momentum distillation, a self-training method which learns from pseudo-targets produced by a momentum model. We provide a theoretical analysis of ALBEF from a mutual information maximization perspective, showing that different training tasks can be interpreted as different ways to generate views for an image-text pair. ALBEF achieves state-of-the-art performance on multiple downstream vision-language tasks. On image-text retrieval, ALBEF outperforms methods that are pre-trained on orders of magnitude larger datasets. On VQA and NLVR2, ALBEF achieves absolute improvements of 2.37\% and 3.84\% compared to the state-of-the-art, while enjoying faster inference speed. Code and pre-trained models are available at https://github.com/salesforce/ALBEF/.},
address = {Online},
author = {Junnan Li and Ramprasaath R. Selvaraju and Akhilesh Deepak Gotmare and Shafiq Joty and Caiming Xiong and Steven Hoi},
booktitle = {2021 Conference on Neural Information Processing Systems},
series = {NeurIPS'21 (spotlight $\sim$3\%)},
title = {Align before Fuse: Vision and Language Representation Learning with Momentum Distillation},
url = {https://arxiv.org/abs/2107.07651},
year = {2021}
}
@inproceedings{lin-et-al-arxiv-21,
abstract = {Advanced large-scale neural language models have led to significant success in many language generation tasks. However, the most commonly used training objective, Maximum Likelihood Estimation (MLE), has been shown problematic, where the trained model prefers using dull and repetitive phrases. In this work, we introduce {ScaleGrad}, a modification straight to the gradient of the loss function, to remedy the degeneration issue of the standard MLE objective. By directly maneuvering the gradient information, {ScaleGrad} makes the model learn to use novel tokens. Empirical results show the effectiveness of our method not only in open-ended generation, but also in directed generation tasks. With the simplicity in architecture, our method can serve as a general training objective that is applicable to most of the neural text generation tasks.},
address = {Virtual},
author = {Xiang Lin and Simeng Han and Shafiq Joty},
booktitle = {Thirty-eighth International Conference on Machine Learning},
numpages = {9},
publisher = {PMLR},
series = {ICML'21 (as long talk $\sim$3\%)},
title = {{Straight to the Gradient: Learning to Use Novel Tokens for Neural Text Generation}},
url = {http://proceedings.mlr.press/v139/lin21b.html},
year = {2021}
}
@inproceedings{nguyen2020multiagent,
abstract = {Recent unsupervised machine translation (UMT) systems usually employ three main principles: initialization, language modeling and iterative back-translation, though they may apply these principles differently. This work introduces another component to this framework: Multi-Agent Cross-translated Diversification (MACD). The method trains multiple UMT agents and then translates monolingual data back and forth using non-duplicative agents to acquire synthetic parallel data for supervised MT. MACD is applicable to all previous UMT approaches. In our experiments, the technique boosts the performance for some commonly used UMT methods by 1.5-2.0 BLEU. In particular, in WMT'14 English-French, WMT'16 German-English and English-Romanian, MACD outperforms cross-lingual masked language model pretraining by 2.3, 2.2 and 1.6 BLEU, respectively. It also yields 1.5-3.3 BLEU improvements in IWSLT English-French and English-German translation tasks. Through extensive experimental analyses, we show that MACD is effective because it embraces data diversity while other similar variants do not.},
address = {Virtual},
author = {Xuan-Phi Nguyen and Shafiq Joty and Thanh-Tung Nguyen and Wu Kui and Ai Ti Aw},
booktitle = {Thirty-eighth International Conference on Machine Learning},
numpages = {9},
publisher = {PMLR},
series = {ICML'21},
title = {Cross-model Back-translated Distillation for Unsupervised Machine Translation},
url = {http://proceedings.mlr.press/v139/nguyen21c/nguyen21c.pdf},
year = {2021}
}
@inproceedings{Yue-emnlp-21,
abstract = {Pre-trained models for Natural Languages (NL) like BERT and GPT have been recently shown to transfer well to Programming Languages (PL) and largely benefit a broad set of code-related tasks. Despite their success, most current methods either rely on an encoder-only (or decoder-only) pre-training that is suboptimal for generation (resp. understanding) tasks or process the code snippet in the same way as NL, neglecting the special characteristics of PL such as token types. We present CodeT5, a unified pre-trained encoder-decoder Transformer model that better leverages the code semantics conveyed from the developer-assigned identifiers. Our model is unified in that it builds on a unified framework to seamlessly support both code understanding and generation tasks, and it employs a unified format of task control codes to allow for multi-task learning. We propose a novel identifier-aware pre-training objective that enables the model to distinguish which code tokens are identifiers and to recover them when they are masked. To further close the gap between the pre-training and fine-tuning, we propose a bimodal dual generation task to encourage the alignment between NL and PL. Comprehensive experiments show that CodeT5 significantly outperforms prior methods on understanding tasks such as code defect detection and clone detection, and generation tasks across various directions including PL-NL, NL-PL, and PL-PL. Further analysis reveals that our model can better capture semantic information from code.},
address = {Online},
author = {Yue Wang and Weishi Wang and Shafiq Joty and Steven Hoi},
booktitle = {Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing},
publisher = {ACL},
series = {EMNLP'21},
title = {{CodeT5}: Identifier-aware Unified Pre-trained Encoder-Decoder Models for Code Understanding and Generation},
url = {https://aclanthology.org/2021.emnlp-main.685/},
year = {2021}
}
@inproceedings{Tao-emnlp-21,
abstract = {Large scale multilingual pre-trained language models have shown promising results in zero- and few-shot cross-lingual tasks. However, recent studies have shown their lack of generalizability when the languages are structurally dissimilar. In this work, we propose a novel fine-tuning method based on co-training that aims to learn more generalized semantic equivalences as complementary to multilingual language modeling using the unlabeled data in the target language. We also propose an adaption method based on contrastive learning to better capture the semantic relationship in the parallel data, when a few translation pairs are available. To show our method's effectiveness, we conduct extensive experiments on cross-lingual inference and review classification tasks across various languages. We report significant gains compared to directly fine-tuning multilingual pre-trained models and other semi-supervised alternatives.},
address = {Online},
author = {Tao Yu and Shafiq Joty},
booktitle = {Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing},
publisher = {ACL},
series = {EMNLP'21},
title = {Effective Fine-tuning Methods for Cross-lingual Adaptation},
url = {https://aclanthology.org/2021.emnlp-main.668/},
year = {2021}
}
@inproceedings{Yingzhu-et-al-emnlp-21,
abstract = {Transformer models have been used in automatic speech recognition (ASR) successfully and yields state-of-the-art results. However, its performance is still affected by speaker mismatch between training and test data. Further finetuning a trained model with target speaker data is the most natural approach for adaptation, but it takes a lot of compute and may cause catastrophic forgetting to the existing speakers. In this work, we propose a unified speaker adaptation approach consisting of feature adaptation and model adaptation. For feature adaptation, we employ a speaker-aware persistent memory model which generalizes better to unseen test speakers by making use of speaker i-vectors to form a persistent memory. For model adaptation, we use a novel gradual pruning method to adapt to target speakers without changing the model architecture, which to the best of our knowledge, has never been explored in ASR. Specifically, we gradually prune less contributing parameters on model encoder to a certain sparsity level, and use the pruned parameters for adaptation, while freezing the unpruned parameters to keep the original model performance. We conduct experiments on the Librispeech dataset. Our proposed approach brings relative 2.74-6.52\% word error rate (WER) reduction on general speaker adaptation. On target speaker adaptation, our method outperforms the baseline with up to 20.1\% relative WER reduction, and surpasses the finetuning method by up to relative 8.62\%. Besides, with extremely low-resource adaptation data (e.g., 1 utterance), our method could improve the WER by relative 6.53\% with only a few epochs of training.},
address = {Online},
author = {Yingzhu Zhao and Chongjia Ni and Cheung-Chi Leung and Shafiq Joty and Eng Siong Chng and Bin Ma},
booktitle = {Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing},
publisher = {ACL},
series = {EMNLP'21},
title = {A Unified Speaker Adaptation Approach for {ASR}},
url = {https://aclanthology.org/2021.emnlp-main.737/},
year = {2021}
}
@inproceedings{ben-et-al-arxiv-20,
abstract = {Class-conditional language models (CC-LMs) can be used to generate natural language with specific attributes, such as style or sentiment, by conditioning on an attribute label, or control code. However, we find that these models struggle to control generation when applied to out-of-domain prompts or unseen control codes. To overcome these limitations, we propose generative discriminator (GeDi) guided contrastive generation, which uses CC-LMs as generative discriminators (GeDis) to efficiently guide generation from a (potentially much larger) LM towards a desired attribute. In our human evaluation experiments, we show that GeDis trained for sentiment control on movie reviews are able to control the tone of book text. We also demonstrate that GeDis are able to detoxify generation and control topic while maintaining the same level of linguistic acceptability as direct generation from GPT-2 (1.5B parameters). Lastly, we show that a GeDi trained on only 4 topics can generalize to new control codes from word embeddings, allowing it to guide generation towards wide array of topics.},
address = {Online},
author = {Ben Krause and Akhilesh Deepak Gotmare and Bryan McCann and Nitish Shirish Keskar and Shafiq Joty and Richard Socher and Nazneen Fatema Rajani},
booktitle = {Findings of the Association for Computational Linguistics: EMNLP 2021},
publisher = {ACL},
series = {EMNLP'21 Findings},
title = {{GeDi}: Generative Discriminator Guided Sequence Generation},
url = {https://arxiv.org/pdf/2009.06367.pdf},
year = {2021}
}
@inproceedings{bari-et-al-arxiv-20,
abstract = {Transfer learning has yielded state-of-the-art results in many supervised natural language processing tasks. However, annotated data for every target task in every target language is rare, especially for low-resource languages. In this work, we propose MultiMix, a novel data augmentation method for semi-supervised learning in zero-shot transfer learning scenarios. In particular, MultiMix targets to solve cross-lingual adaptation problems from a source (language) distribution to an unknown target (language) distribution assuming it has no training labels in the target language task. In its heart, MultiMix performs simultaneous self-training with data augmentation and unsupervised sample selection. To show its effectiveness, we have performed extensive experiments on zero-shot transfers for cross-lingual named entity recognition (XNER) and natural language inference (XNLI). Our experiments show sizeable improvements in both tasks outperforming the baselines by a good margin.},
address = {Bangkok, Thailand},
author = {M Saiful Bari and Tasnim Mohiuddin and Shafiq Joty},
booktitle = {Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics},
numpages = {15},
pages = {1978--1992},
publisher = {ACL},
series = {ACL'21},
title = {{UXLA: A Robust Unsupervised Data Augmentation Framework for Cross-Lingual NLP}},
year = {2021}
}
@inproceedings{Nguyen-et-al-acl-21,
abstract = {We introduce a generic seq2seq parsing framework that casts constituency parsing problems (syntactic and discourse parsing) into a series of conditional splitting decisions. Our parsing model estimates the conditional probability distribution of possible splitting points in a given text span and supports efficient top-down decoding, which is linear in number of nodes. The conditional splitting formulation together with efficient beam search inference facilitate structural consistency without relying on expensive structured inference. Crucially, for discourse analysis we show that in our formulation, discourse segmentation can be framed as a special case of parsing which allows us to perform discourse parsing without requiring segmentation as a pre-requisite. Experiments show that our model achieves good results on the standard syntactic parsing tasks under settings with/without pre-trained representations and rivals state-of-the-art (SoTA) methods that are more computationally expensive than ours. In discourse parsing, our method outperforms SoTA by a good margin. Our source code will be publicly available.},
address = {Bangkok, Thailand},
author = {Thanh-Tung Nguyen and Xuan-Phi Nguyen and Shafiq Joty and Xiaoli Li},
booktitle = {Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics},
numpages = {9},
publisher = {ACL},
series = {ACL'21},
title = {A Conditional Splitting Framework for Efficient Constituency Parsing},
url = {https://aclanthology.org/2021.acl-long.450/},
year = {2021}
}
@inproceedings{Tan-et-al-acl-21,
abstract = {Questions of fairness, robustness, and transparency are paramount to address before deploying NLP systems. Central to these concerns is the question of reliability: Can NLP systems reliably treat different demographics fairly \emph{and} function correctly in diverse and noisy environments? To address this, we argue for the need for reliability testing and contextualize it among existing work on improving accountability. We show how adversarial attacks can be reframed for this goal, via a framework for developing reliability tests. We argue that reliability testing --- with an emphasis on interdisciplinary collaboration --- will enable rigorous and targeted testing, and aid in the enactment and enforcement of industry standards.},
address = {Bangkok, Thailand},
author = {Samson Tan and Shafiq Joty and Kathy Baxter and Araz Taeihagh and Gregory A. Bennett and Min-Yen Kan},
booktitle = {Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics},
numpages = {17},
pages = {4153--4169},
publisher = {ACL},
series = {ACL'21},
title = {Reliability Testing for Natural Language Processing Systems},
url = {https://aclanthology.org/2021.acl-long.321/},
year = {2021}
}
@inproceedings{linlin-et-al-acl-21,
abstract = {Named Entity Recognition (NER) for low-resource languages is a both practical and challenging research problem. This paper addresses zero-shot transfer for cross-lingual NER, especially when the amount of source-language training data is also limited. The paper first proposes a simple but effective labeled sequence translation method to translate source-language training data to target languages and avoids problems such as word order change and entity span determination. With the source-language data as well as the translated data, a generation-based multilingual data augmentation method is introduced to further increase diversity by generating synthetic labeled data in multiple languages. These augmented data enable the language model based NER models to generalize better with both the language-specific features from the target-language synthetic data and the language-independent features from multilingual synthetic data. An extensive set of experiments were conducted to demonstrate encouraging cross-lingual transfer performance of the new research on a wide variety of target languages. The code and data in this work will be made public for the research community.},
address = {Bangkok, Thailand},
author = {Linlin Liu and Bosheng Ding and Lidong Bing and Shafiq Joty and Luo Si and Chunyan Miao},
booktitle = {Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics},
numpages = {9},
publisher = {ACL},
series = {ACL'21},
title = {{MulDA}: A Multilingual Data Augmentation Framework for Low-Resource Cross-Lingual {NER}},
url = {https://aclanthology.org/2021.acl-long.453/},
year = {2021}
}
@inproceedings{Mohiuddin-et-al-acl-21,
abstract = {The success of Neural Machine Translation (NMT) largely depends on the availability of large bitext training corpora. Due to the lack of such large corpora in low-resource language pairs, NMT systems often exhibit poor performance. Extra relevant monolingual data often helps, but acquiring it could be quite expensive, especially for low-resource languages. Moreover, domain mismatch between bitext (train/test) and monolingual data might degrade the performance. To alleviate such issues, we propose AugVic, a novel data augmentation framework for low-resource NMT which exploits the vicinal samples of the given bitext without using any extra monolingual data explicitly. It can diversify the in-domain bitext data with finer level control. Through extensive experiments on four low-resource language pairs comprising data from different domains, we have shown that our method is comparable to the traditional back-translation that uses extra in-domain monolingual data. When we combine the synthetic parallel data generated from AugVic with the ones from the extra monolingual data, we achieve further improvements. We show that AugVic helps to attenuate the discrepancies between relevant and distant-domain monolingual data in traditional back-translation. To understand the contributions of different components of AugVic, we perform an in-depth framework analysis.},
address = {Bangkok, Thailand},
author = {Tasnim Mohiuddin and M Saiful Bari and Shafiq Joty},
booktitle = {Findings of the Association for Computational Linguistics: ACL-IJCNLP 2021},
numpages = {9},
pages = {xx--xx},
publisher = {ACL},
series = {ACL'21 Findings},
title = {{AugVic}: Exploiting {BiText} Vicinity for Low-Resource {NMT}},
url = {https://arxiv.org/abs/2106.05141},
year = {2021}
}
@inproceedings{samson-naacl-21,
abstract = {Multilingual models have demonstrated impressive cross-lingual transfer performance. However, test sets like XNLI are monolingual at the example level. In multilingual communities, it is common for polyglots to code-mix when conversing with each other. Inspired by this phenomenon, we present two strong black-box adversarial attacks (one word-level, one phrase-level) for multilingual models that push their ability to handle code-mixed sentences to the limit. The former uses bilingual dictionaries to propose perturbations and translations of the clean example for sense disambiguation. The latter directly aligns the clean example with its translations before extracting phrases as perturbations. Our phrase-level attack has a success rate of 89.75\% against XLM-R$_\text{large}$, bringing its average accuracy of 79.85 down to 8.18 on XNLI. Finally, we propose an efficient adversarial training scheme that trains in the same number of steps as the original model and show that it improves model accuracy.},
address = {Mexico City, Mexico},
author = {Samson Tan and Shafiq Joty},
booktitle = {Proceedings of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies},
numpages = {9},
pages = {xx--xx},
publisher = {ACL},
series = {NAACL'21},
title = {Code-Mixing on {Sesame Street}: Dawn of the Adversarial Polyglots},
url = {https://aclanthology.org/2021.naacl-main.282.pdf},
year = {2021}
}
@inproceedings{nguyen-naacl-21,
abstract = {We introduce a novel top-down end-to-end formulation of document level discourse parsing in the Rhetorical Structure Theory (RST) framework. In this formulation, we consider discourse parsing as a sequence of splitting decisions at token boundaries and use a seq2seq network to model the splitting decisions. Our framework facilitates discourse parsing from scratch without requiring discourse segmentation as a prerequisite; rather, it yields segmentation as part of the parsing process. Our unified parsing model adopts a beam search to decode the best tree structure by searching through a space of high scoring trees. With extensive experiments on the standard RST discourse treebank, we demonstrate that our parser outperforms existing methods by a good margin in both end-to-end parsing and parsing with gold segmentation. More importantly, it does so without using any handcrafted features, making it faster and easily adaptable to new languages and domains.},
address = {Mexico City, Mexico},
author = {Thanh-Tung Nguyen and Xuan-Phi Nguyen and Shafiq Joty and Xiaoli Li},
booktitle = {Proceedings of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies},
numpages = {9},
pages = {xx--xx},
publisher = {ACL},
series = {NAACL'21},
title = {{RST} Parsing from Scratch},
url = {https://arxiv.org/abs/2105.10861},
year = {2021}
}
@inproceedings{alex-naacl-21,
abstract = {Models pretrained with self-supervised objectives on large text corpora achieve state-of-the-art performance on text summarization tasks. However, these models are typically fine-tuned on hundreds of thousands of data points, an infeasible requirement when applying summarization to new, niche domains. In this work, we introduce a general method, called WikiTransfer, for fine-tuning pretrained models for summarization in an unsupervised, dataset-specific manner which makes use of characteristics of the target dataset such as the length and abstractiveness of the desired summaries. We achieve state-of-the-art, zero-shot abstractive summarization performance on the CNN-DailyMail dataset and demonstrate the effectiveness of our approach on three additional, diverse datasets. The models fine-tuned in this unsupervised manner are more robust to noisy data and also achieve better few-shot performance using 10 and 100 training examples. We perform ablation studies on the effect of the components of our unsupervised fine-tuning data and analyze the performance of these models in few-shot scenarios along with data augmentation techniques using both automatic and human evaluation.},
address = {Mexico City, Mexico},
author = {Alexander Fabbri and Simeng Han and Haoyuan Li and Haoran Li and Marjan Ghazvininejad and Shafiq Joty and Dragomir Radev and Yashar Mehdad},
booktitle = {Proceedings of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies},
numpages = {9},
pages = {xx--xx},
publisher = {ACL},
series = {NAACL'21},
title = {Improving Zero and Few-Shot Abstractive Summarization with Intermediate Fine-tuning and Data Augmentation},
url = {https://arxiv.org/abs/2010.12836},
year = {2021}
}
@inproceedings{weiwen-naacl-21,
abstract = {Neural Machine Translation (NMT) has achieved significant breakthrough in performance but is known to suffer vulnerability to input perturbations. As real input noise is difficult to predict during training, robustness is a big issue for system deployment. In this paper, we improve the robustness of NMT models by reducing the effect of noisy words through a Context-Enhanced Reconstruction (CER) approach. CER trains the model to resist noise in two steps: (1) perturbation step that breaks the naturalness of input sequence with made-up words; (2) reconstruction step that defends the noise propagation by generating better and more robust contextual representation. Experimental results on Chinese-English (ZH-EN) and French-English (FR-EN) translation tasks demonstrate robustness improvement on both news and social media text. Further fine-tuning experiments on social media text show our approach can converge at a higher position and provide a better adaptation.},
address = {Mexico City, Mexico},
author = {Weiwen Xu and AiTi Aw and Yang Ding and Kui Wu and Shafiq Joty},
booktitle = {Proceedings of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (Industry Track)},
numpages = {8},
pages = {xx--xx},
publisher = {ACL},
series = {NAACL'21},
title = {Addressing the Vulnerability of {NMT} in Input Perturbations},
url = {},
year = {2021}
}
@inproceedings{mohiuddin-coh-et-al-arxiv-20,
abstract = {Although coherence modeling has come a long way in developing novel models, their evaluation on downstream applications for which they are purportedly developed has largely been neglected. With the advancements made by neural approaches in applications such as machine translation (MT), summarization and dialog systems, the need for coherence evaluation of these tasks is now more crucial than ever. However, coherence models are typically evaluated only on synthetic tasks, which may not be representative of their performance in downstream applications. To investigate how representative the synthetic tasks are of downstream use cases, we conduct experiments on benchmarking well-known traditional and neural coherence models on synthetic sentence ordering tasks, and contrast this with their performance on three downstream applications: coherence evaluation for MT and summarization, and next utterance prediction in retrieval-based dialog. Our results demonstrate a weak correlation between the model performances in the synthetic tasks and the downstream applications, motivating alternate training and evaluation methods for coherence models.},
address = {Kyiv},
author = {Tasnim Mohiuddin* and Prathyusha Jwalapuram* and Xiang Lin* and Shafiq Joty*},
booktitle = {Proceedings of the European Chapter of the ACL},
numpages = {9},
pages = {x--x},
publisher = {ACL},
series = {EACL'21},
title = {Rethinking Coherence Modeling: Synthetic vs. Downstream Tasks},
url = {https://arxiv.org/abs/2004.14626},
year = {2021}
}
@inproceedings{li-cikm21,
abstract = {We study the task of span-level emotion cause analysis (SECA), which is focused on identifying the specific emotion cause span(s) triggering a certain emotion in the text. Compared to the popular clause-level emotion cause analysis (CECA), it is a finer-grained emotion cause analysis (ECA) task. Existing SECA method relies on the manually engineered features, which is labor-intensive and not generalized well. In this paper, we design a BERT-based graph attention network for emotion cause span(s) identification. The proposed model takes advantage the structure of BERT to capture the relationship information between emotion and text, and utilizes graph attention network to model the structure information of the text. Our SECA method can be easily used for extracting clause-level emotion causes for CECA as well. Experimental results show that the proposed method consistently outperform the state-of-the-art ECA methods on benchmark emotion cause dataset.},
address = {Online},
author = {Xiangju Li and Wei Gao and Shi Feng and Daling Wang and Shafiq Joty},
booktitle = {Proceedings of The 30th ACM International Conference on Information and Knowledge Management},
month = {November},
pages = {xx--xx},
publisher = {ACM},
series = {CIKM'21 (short paper)},
title = {Span-Level Emotion Cause Analysis by {BERT}-based Graph Attention Network},
url = {},
year = {2021}
}
@inproceedings{li-cikm21b,
abstract = {We study the task of span-level Emotion cause analysis (SECA), which is focused on extracting the specific emotion cause span(s) for a certain emotion expressed in the given context. Compared to popular clause-level emotion cause analysis (CECA), it is a finer-grained emotion cause analysis (ECA) task. The existing SECA method heavily dependents on the effectiveness of designed features, which is labor-intensive and not generalized well. In this paper, we formalize SECA as a sequence tagging task for which several variants of neural network-based sequence tagging models to extract specific emotion cause span(s) in the given context. These models combine different types of encoding and decoding approaches. Furthermore, to make our models more ``emotionally sensitive'', we utilize the multi-head attention mechanism to enhance the representation of context. Experimental evaluations conducted on two benchmark datasets show that our proposed models create new state-of-the-art results. Our work is the first using neural sequence tagging method for span-level ECA.},
address = {Online},
author = {Xiangju Li and Wei Gao and Shi Feng and Daling Wang and Shafiq Joty},
booktitle = {Proceedings of The 30th ACM International Conference on Information and Knowledge Management},
month = {November},
pages = {xx--xx},
publisher = {ACM},
series = {CIKM'21 (short paper)},
title = {Span-level Emotion Cause Analysis with Neural Sequence Tagging},
url = {},
year = {2021}
}
@inproceedings{zhao-et-al-icassp-21,
abstract = {With the recent development of end-to-end models in speech recognition, there have been more interests in adapting these models for online speech recognition. However, using end-to-end models for online speech recognition is known to suffer from an early endpointing problem, which brings in many deletion errors. In this paper, we propose to address the early endpointing problem from the gradient perspective. Specifically, we leverage on the recently proposed ScaleGrad technique, which was proposed to mitigate the text degeneration issue. Different from ScaleGrad, we adapt it to discourage the early generation of the end-of-sentence (EOS) token. A scaling term is added to directly maneuver the gradient of the training loss to encourage the model to learn to keep generating non-EOS tokens. Compared with previous approaches such as voice-activity-detection and end-of-query detection, the proposed method does not rely on various types of silence, and it also saves the trouble from obtaining the ground truth endpoint with forced alignment. Nevertheless, it can be jointly applied with other techniques. Experiments on AISHELL-1 dataset show that our model brings relative 5.4\%-10.1\% CER reductions over the baseline, and surpasses the unlikelihood training method which directly reduces the generation probability of EOS token.},
address = {Brighton, UK},
author = {Yingzhu Zhao and Chongjia Ni and Cheung-Chi Leung and Shafiq Joty and Eng Siong Chng and Bin Ma},
booktitle = {International Conference on Acoustics, Speech, and Signal Processing},
pages = {xx--xx},
publisher = {IEEE},
series = {ICASSP'21},
title = {Preventing Early Endpointing for Online Automatic Speech Recognition},
url = {https://2021.ieeeicassp.org/Papers/AcceptedPapers.asp},
year = {2021}
}
@inproceedings{wang-et-al-sigmod-21,
abstract = {The widespread usage of RDBMS in the commercial world has played a pivotal role in the offering of database systems course in major universities. A key challenge encountered by learners taking such a course is the topic of query optimization. The query optimization process produces a query execution plan (QEP), which represents an execution strategy for an SQL query. Unfortunately, in practice, it is often difficult for a learner to comprehend query execution strategies by perusing vendor-specific QEPs, hindering her learning process. In this paper, we present a novel, end-to-end, generic system called LANTERN that generates a natural language description of a QEP to enhance its understanding. It takes as input an SQL query and its QEP, and generates a natural language description of the execution strategy deployed by the underlying RDBMS. Specifically, it deploys a declarative framework called POOL that enables subject matter experts to efficiently create and maintain natural language descriptions of physical operators used in QEPs. A rule-based framework called RULE-LANTERN is proposed that exploits POOL to generate natural language descriptions of QEPs. Despite the high accuracy of RULE-LANTERN, our engagement with learners reveal that consistent with existing psychology theories perusing such rule-based descriptions lead to boredom due to repetitive statements across different QEPs. To address this issue, we present a novel deep learning-based language generation framework called NEURAL-LANTERN that infuses language variability in the generated description by exploiting a set of paraphrasing tools and word embedding. Our experimental study with real learners shows the effectiveness of LANTERN in facilitating comprehension of QEPs.},
address = {Xi'an, Shaanxi, China},
author = {Weiguo Wang and Sourav S Bhowmick and Hui Li and Shafiq Joty and Siyuan Liu},
booktitle = {Proceedings of 2021 ACM SIGMOD International Conference on Management of Data},
month = {June},
pages = {x--x},
publisher = {ACM},
series = {SIGMOD'21},
title = {Towards Enhancing Database Education: Natural Language Generation Meets Query Execution Plans},
url = {},
year = {2021}
}
@inproceedings{phi-et-al-arxiv-19,
abstract = {A common approach to improve neural machine translation is to invent new architectures. However, the research process of designing and refining such new models is often exhausting. Another approach is to resort to huge extra monolingual data to conduct semi-supervised training, like back-translation. But extra monolingual data is not always available, especially for low resource languages. In this paper, we propose to diversify the available training data by using multiple forward and backward peer models to augment the original training dataset. Our method does not require extra data like back-translation, nor additional computations and parameters like using pretrained models. Our data diversification method achieves state-of-the-art BLEU score of 30.7 in the WMT'14 English-German task. It also consistently and substantially improves translation quality in 8 other translation tasks: 4 IWSLT tasks (English-German and English-French) and 4 low-resource translation tasks (English-Nepali and English-Sinhala).},
address = {Vancouver, Canada},
author = {Xuan-Phi Nguyen and Shafiq Joty and Kui Wu and Ai Ti Aw},
booktitle = {2020 Conference on Neural Information Processing Systems},
series = {NeurIPS'20},
title = {Data Diversification: An Elegant Strategy for Neural Machine Translation},
url = {https://proceedings.neurips.cc/paper/2020/file/7221e5c8ec6b08ef6d3f9ff3ce6eb1d1-Paper.pdf},
year = {2020}
}
@inproceedings{Gu-et-al-nips-20,
abstract = {Structured representations of images according to visual relationships are beneficial for many vision and vision-language applications. However, current human-annotated visual relationship datasets suffer from the long-tailed predicate distribution problem which limits the potentials of visual relationship models. In this work, we introduce a self-supervised method that implicitly learns the visual relationships without relying on any ground-truth visual relationship annotations. Our method relies on 1) intra- and inter-modality encodings to respectively model relationships within each modality separately and jointly, and 2) relationship probing, which seeks to discover the graph structure within each modality. By leveraging masked language modeling, contrastive learning, and dependency tree distances for self-supervision, our method can learn better object features as well as implicit visual relationships. We verify the effectiveness of our proposed method on various vision-language tasks that benefit from improved visual relationship understanding.},
address = {Vancouver, Canada},
author = {Jiuxiang Gu and Jason Kuen and Shafiq Joty and Jianfei Cai and Vlad Morariu and Handong Zhao and Tong Sun},
booktitle = {2020 Conference on Neural Information Processing Systems},
series = {NeurIPS'20},
title = {Self-Supervised Relationship Probing},
url = {https://papers.nips.cc/paper/2020/file/13f320e7b5ead1024ac95c3b208610db-Paper.pdf},
year = {2020}
}
@inproceedings{mohiuddin-et-al-arxiv-20,
abstract = {Most of the successful and predominant methods for bilingual lexicon induction (BLI) are mapping-based, where a linear mapping function is learned with the assumption that the word embedding spaces of different languages exhibit similar geometric structures (i.e., approximately isomorphic). However, several recent studies have criticized this simplified assumption showing that it does not hold in general even for closely related languages. In this work, we propose a novel semi-supervised method to learn cross-lingual word embeddings for BLI. Our model is independent of the isomorphic assumption and uses nonlinear mapping in the latent space of two independently trained auto-encoders. Through extensive experiments on fifteen (15) different language pairs (in both directions) comprising resource-rich and low-resource languages from two different datasets, we demonstrate that our method outperforms existing models by a good margin. Ablation studies show the importance of different model components and the necessity of non-linear mapping.},
address = {Punta Cana, Dominican Republic},
author = {Tasnim Mohiuddin and M Saiful Bari and Shafiq Joty},
booktitle = {Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing},
numpages = {9},
pages = {2712--2723},
publisher = {ACL},
series = {EMNLP'20},
title = {{LNMap}: Departures from Isomorphic Assumption in Bilingual Lexicon Induction Through Non-Linear Mapping in Latent Space},
url = {https://www.aclweb.org/anthology/2020.emnlp-main.215/},
year = {2020}
}
@inproceedings{jwala-et-al-emnlp-20,
abstract = {Popular Neural Machine Translation model training uses strategies like backtranslation to improve BLEU scores, requiring large amounts of additional data and training. We introduce a class of conditional generative-discriminative hybrid losses that we use to finetune a trained machine translation model. Through a combination of targeted finetuning objectives and intuitive re-use of the training data the model has failed to adequately learn from, we improve the model performance of both a sentence-level and a simple contextual model without using any additional data. We target the improvement of pronoun translations through our finetuning and evaluate our models on a pronoun benchmark testset. Our sentence-level model shows a 0.5 BLEU improvement on both the WMT14 and the IWSLT13 De-En testsets, while our simple contextual model achieves the best results, improving from 31.81 to 32 BLEU on WMT14 De-En testset, and from 32.10 to 33.13 on the IWSLT13 De-En testset, with corresponding improvements in pronoun translation.},
address = {Punta Cana, Dominican Republic},
author = {Prathyusha Jwalapuram and Shafiq Joty and Youlin Shen},
booktitle = {Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing},
numpages = {9},
pages = {2267--2279},
publisher = {ACL},
series = {EMNLP'20},
title = {Pronoun-Targeted Finetuning for {NMT} with Hybrid Losses},
url = {https://www.aclweb.org/anthology/2020.emnlp-main.177/},
year = {2020}
}
@inproceedings{tan-et-al-arxiv-20,
abstract = {Morphological inflection is a process of word formation where base words are modified to express different grammatical categories such as tense, case, voice, person, or number. World Englishes, such as Colloquial Singapore English (CSE) and African American Vernacular English (AAVE), differ from Standard English dialects in inflection use. Although comprehension by human readers is usually unimpaired by non-standard inflection use, NLP systems are not so robust. We introduce a new Base-Inflection Encoding of English text that is achieved by combining linguistic and statistical techniques. Fine-tuning pre-trained NLP models for downstream tasks under this novel encoding achieves robustness to non-standard inflection use while maintaining performance on Standard English examples. Models using this encoding also generalize better to non-standard dialects without explicit training. We suggest metrics to evaluate tokenizers and extensive model-independent analyses demonstrate the efficacy of the encoding when used together with data-driven subword tokenizers.},
address = {Punta Cana, Dominican Republic},
author = {Samson Tan and Shafiq Joty and Lav R. Varshney and Min-Yen Kan},
booktitle = {Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing},
numpages = {9},
pages = {5647--5663},
publisher = {ACL},
series = {EMNLP'20},
title = {Mind Your Inflections! Improving {NLP} for Non-Standard {English} with Base-Inflection Encoding},
url = {https://www.aclweb.org/anthology/2020.emnlp-main.455/},
year = {2020}
}
@inproceedings{Weishi-et-al-emnlp-20,
abstract = {While participants in a multi-party multi-turn conversation simultaneously engage in multiple conversation topics, existing response selection methods are developed mainly focusing on a two-party single-conversation scenario. Hence, the prolongation and transition of conversation topics are ignored by current methods. In this work, we frame response selection as a dynamic topic tracking task to match the topic between the response and relevant conversation context. With this new formulation, we propose a novel multi-task learning framework that supports efficient encoding through large pretrained models with only two utterances at once to perform dynamic topic disentanglement and response selection. We also propose Topic-BERT an essential pretraining step to embed topic information into BERT with self-supervised learning. Experimental results on the DSTC-8 Ubuntu IRC dataset show state-of-the-art results in response selection and topic disentanglement tasks outperforming existing methods by a good margin.},
address = {Punta Cana, Dominican Republic},
author = {Weishi Wang and Shafiq Joty and Steven Hoi},
booktitle = {Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing},
numpages = {9},
pages = {6581--6591},
publisher = {ACL},
series = {EMNLP'20},
title = {Response Selection for Multi-Party Conversations with Dynamic Topic Tracking},
url = {https://www.aclweb.org/anthology/2020.emnlp-main.533/},
year = {2020}
}
@inproceedings{Tao-emnlp-20,
abstract = {Huge amounts of textual conversations occur online everyday, where multiple conversations take place concurrently. Interleaved conversations lead to difficulties in not only following the ongoing discussions but also extracting relevant information from simultaneous messages. Conversation disentanglement aims to separate intermingled messages into detached conversations. However existing disentanglement methods rely mostly on hand-crafted features that are dataset specific, which hinders generalization and adaptability. In this work, we propose an end-to-end online framework for conversation disentanglement that avoids time-consuming domain-specific feature engineering. We design a novel way to embed the whole utterance that comprises timestamp, speaker and message text, and propose a custom attention mechanism that models disentanglement as a pointing problem while effectively capturing inter-utterance interactions in an end-to-end fashion. We also introduce a joint-learning objective to better capture contextual information. Our experiments on the Ubuntu IRC dataset show that our method achieves state-of-the-art performance in both link and conversation prediction tasks.},
address = {Punta Cana, Dominican Republic},
author = {Tao Yu and Shafiq Joty},
booktitle = {Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing},
numpages = {9},
pages = {6321--6330},
publisher = {ACL},
series = {EMNLP'20},
title = {Online Conversation Disentanglement with Pointer Networks},
url = {https://www.aclweb.org/anthology/2020.emnlp-main.512/},
year = {2020}
}
@inproceedings{yue-et-al-arxiv-20,
abstract = {Visual dialog is a challenging vision-language task, where a dialog agent needs to answer a series of questions through reasoning on the image content and dialog history. Prior work has mostly focused on various attention mechanisms to model such intricate interactions. By contrast, in this work, we propose VD-BERT, a simple yet effective framework of unified vision-dialog Transformer that leverages the pretrained BERT language models for Visual Dialog tasks. The model is unified in that (1) it captures all the interactions between the image and the multi-turn dialog using a single-stream Transformer encoder, and (2) it supports both answer ranking and answer generation seamlessly through the same architecture. More crucially, we adapt BERT for the effective fusion of vision and dialog contents via visually grounded training. Without the need of pretraining on external vision-language data, our model yields new state of the art, achieving the top position in both single-model and ensemble settings (74.54 and 75.35 NDCG scores) on the visual dialog leaderboard.},
address = {Punta Cana, Dominican Republic},
author = {Yue Wang and Shafiq Joty and Michael R. Lyu and Irwin King and Caiming Xiong and Steven Hoi},
booktitle = {Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing},
numpages = {9},
pages = {3325--3338},
publisher = {ACL},
series = {EMNLP'20},
title = {{VD-BERT}: A Unified Vision and Dialog Transformer with {BERT}},
url = {https://www.aclweb.org/anthology/2020.emnlp-main.269/},
year = {2020}
}
@inproceedings{Gao-et-al-emnlp-20,
abstract = {Document interpretation and dialog understanding are the two major challenges for conversational machine reading. In this work, we propose DISCERN, a discourse-aware entailment reasoning network to strengthen the connection and enhance the understanding for both document and dialog. Specifically, we split the document into clause-like elementary discourse units (EDU) using a pre-trained discourse segmentation model, and we train our model in a weakly-supervised manner to predict whether each EDU is entailed by the user feedback in a conversation. Based on the learned EDU and entailment representations, we either reply to the user our final decision ``yes/no/irrelevant'' of the initial question, or generate a follow-up question to inquiry more information. Our experiments on the ShARC benchmark (blind, held-out test set) show that DISCERN achieves state-of-the-art results of 78.3\% macro-averaged accuracy on decision making and 64.0 BLEU1 on follow-up question generation.},
address = {Punta Cana, Dominican Republic},
author = {Yifan Gao and Chien-Sheng Wu and Jingjing Li and Shafiq Joty and Steven Hoi and Caiming Xiong and Irwin King and Michael Lyu},
booktitle = {Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing},
numpages = {9},
pages = {2439--2449},
publisher = {ACL},
series = {EMNLP'20},
title = {Discern: Discourse-Aware Entailment Reasoning Network for Conversational Machine Reading},
url = {https://www.aclweb.org/anthology/2020.emnlp-main.191/},
year = {2020}
}
@inproceedings{bosheng-et-al-emnlp-20,
abstract = {Data augmentation techniques have been widely used to improve machine learning performance. In this work, we propose a novel method to generate high quality synthetic data for low-resource tagging tasks with language models, where the language model is trained with the linearized labeled sentences. Our method is applicable to both supervised and semi-supervised settings. For the supervised setting, we conduct extensive experiments on named entity recognition (NER), part of speech (POS) and end-to-end target based sentiment analysis (E2E-TBSA) tasks. While for the semi-supervised setting, we evaluate our method on the NER task under the conditions of given unlabeled data only and unlabeled data plus a knowledge base. The results show that our method can consistently outperform the baselines, particularly when the given gold training data are less.},
address = {Punta Cana, Dominican Republic},
author = {Bosheng Ding and Linlin Liu and Lidong Bing and Canasai Kruengkrai and Thien Hai Nguyen and Shafiq Joty and Luo Si and Chunyan Miao},
booktitle = {Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing},
numpages = {9},
pages = {6045--6057},
publisher = {ACL},
series = {EMNLP'20},
title = {{DAGA}: Data Augmentation with a Generation Approach for Low-resource Tagging Tasks},
url = {https://www.aclweb.org/anthology/2020.emnlp-main.488/},
year = {2020}
}
@inproceedings{sustainlp-2020-sustainlp,
address = {Punta Cana, Dominican Republic},
author = {Moosavi, Nafise Sadat and Fan, Angela and Shwartz, Vered and Glavas, Goran and Joty, Shafiq and Wang, Alex and Wolf, Thomas},
booktitle = {Proceedings of SustaiNLP},
publisher = {ACL},
series = {EMNLP'20 Workshop},
title = {Workshop on Simple and Efficient Natural Language Processing},
url = {https://www.aclweb.org/anthology/2020.sustainlp-1.0},
year = {2020}
}
@inproceedings{Nguyen-et-al-acl-20,
abstract = {We propose a novel constituency parsing model that casts the parsing problem into a series of pointing tasks. Specifically, our model estimates the likelihood of a span being a legitimate tree constituent via the pointing score corresponding to the boundary words of the span. Our parsing model supports efficient top-down decoding and our learning objective is able to enforce structural consistency without resorting to the expensive CKY inference. The experiments on the standard English Penn Treebank parsing task show that our method achieves 92.78 F1 without using pre-trained models, which is higher than all the existing methods with similar time complexity. Using pre-trained BERT, our model achieves 95.48 F1, which is competitive with the state-of-the-art while being faster. Our approach also establishes new state-of-the-art in Basque and Swedish in the SPMRL shared tasks on multilingual constituency parsing.},
address = {Seattle, USA},
author = {Thanh-Tung Nguyen and Xuan-Phi Nguyen and Shafiq Joty and Xiaoli Li},
booktitle = {Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics},
numpages = {9},
pages = {3284--3294},
publisher = {ACL},
series = {ACL'20},
title = {Efficient Constituency Parsing by Pointing},
url = {https://www.aclweb.org/anthology/2020.acl-main.301/},
year = {2020}
}
@inproceedings{Nguyen-et-al-acl2-20,
abstract = {We propose Differentiable Window, a new neural module and general purpose component for dynamic window selection. While universally applicable, we demonstrate a compelling use case of utilizing Differentiable Window to improve standard attention modules by enabling more focused attentions over the input regions. We propose two variants of Differentiable Window, and integrate them within the Transformer architecture in two novel ways. We evaluate our proposed approach on a myriad of NLP tasks, including machine translation, sentiment analysis, subject-verb agreement and language modeling. Our experimental results demonstrate consistent and sizable improvements across all tasks.},
address = {Seattle, USA},
author = {Thanh-Tung Nguyen and Xuan-Phi Nguyen and Shafiq Joty and Xiaoli Li},
booktitle = {Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics},
numpages = {9},
pages = {6589--6599},
publisher = {ACL},
series = {ACL'20},
title = {Differentiable Window for Dynamic Local Attention},
url = {https://www.aclweb.org/anthology/2020.acl-main.589/},
year = {2020}
}
@inproceedings{Gao-et-al-acl-20,
abstract = {Conversational machine reading aims to teach machines to interact with users and answer their questions. It is challenging because machines have to understand the knowledge base text, evaluate and keep track of the user scenario, ask clarification questions, and then make a final decision.
Existing approaches have implicit rule text reasoning processes for decision making and weak abilities for question-related rule extraction. In this paper, we present a new framework of conversational machine reading with a novel Explicit Memory Tracker (EMT) that explicitly tracks whether conditions listed in the rule text have already been satisfied to make a decision. Moreover, our framework generates clarifying questions by adopting a coarse-to-fine reasoning strategy, utilizing sentence-level selection scores to weight token-level distributions. On the ShARC benchmark (blind, held-out test set), EMT achieves new state-of-the-art results of 74.8% micro-averaged decision accuracy and 46.0 BLEU4. We also show that EMT is more interpretable by visualizing the entailment-oriented reasoning process as the conversation flows. Code and models will be released to facilitate research along this line.},
address = {Seattle, USA},
author = {Yifan Gao and Chien-Sheng Wu and Shafiq Joty and Caiming Xiong and Richard Socher and Irwin King and Michael Lyu and Steven Hoi},
booktitle = {Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics},
numpages = {9},
pages = {935--945},
publisher = {ACL},
series = {ACL'20},
title = {EMT: Explicit Memory Tracker with Coarse-to-Fine Reasoning for Conversational Machine Reading},
url = {https://www.aclweb.org/anthology/2020.acl-main.88/},
year = {2020}
}
@inproceedings{Tan-et-al-acl-20,
abstract = {Training on only perfect Standard English corpora predisposes pre-trained neural networks to discriminate against minorities from non-standard linguistic backgrounds. We perturb the inflectional morphology of words to craft plausible and semantically similar adversarial examples that expose these biases in popular models, e.g., BERT and Transformer, and show that adversarially finetuning them for a single epoch significantly improves robustness without sacrificing performance on clean data.},
address = {Seattle, USA},
author = {Samson Tan and Shafiq Joty and Min-Yen Kan and Richard Socher},
booktitle = {Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics},
numpages = {9},
pages = {2920--2935},
publisher = {ACL},
series = {ACL'20},
title = {It’s Morphin’ Time! Combating Linguistic Discrimination with Inflectional Perturbations},
url = {https://www.aclweb.org/anthology/2020.acl-main.263/},
year = {2020}
}
@inproceedings{nguyen-et-al-20,
  author    = {Xuan-Phi Nguyen and Shafiq Joty and Steven Hoi and Richard Socher},
  title     = {Tree-Structured Attention with Hierarchical Accumulation},
  booktitle = {International Conference on Learning Representations},
  series    = {ICLR-20},
  year      = {2020},
  url       = {https://openreview.net/forum?id=HJxK5pEYvr},
  abstract  = {Incorporating hierarchical structures like constituency trees has been shown to be effective for various natural language processing (NLP) tasks. However, it is evident that state-of-the-art (SOTA) sequence-based models like the Transformer struggle to encode such structures inherently. On the other hand, dedicated models like the Tree-LSTM, while explicitly modeling hierarchical structures, do not perform as efficiently as the Transformer. In this paper, we attempt to bridge this gap with Hierarchical Accumulation to encode parse tree structures into self-attention at constant time complexity. Our approach outperforms SOTA methods in four IWSLT translation tasks and the WMT'14 English-German task. It also yields improvements over Transformer and Tree-LSTM on three text classification tasks. We further demonstrate that using hierarchical priors can compensate for data shortage, and that our model prefers phrase-level attentions over token-level attentions.}
}
@inproceedings{shi-et-al-eccv-20,
  author    = {Xiangxi Shi and Xu Yang and Jiuxiang Gu and Shafiq Joty and Jianfei Cai},
  title     = {Finding It at Another Side: A Viewpoint-Adapted Matching Encoder for Change Captioning},
  booktitle = {European Conference on Computer Vision},
  series    = {ECCV'20},
  address   = {Virtual},
  year      = {2020},
  numpages  = {10},
  url       = {http://www.ecva.net/papers/eccv_2020/papers_ECCV/papers/123590562.pdf},
  abstract  = {Change Captioning is a task that aims to describe the difference between images with natural language. Most existing methods treat this problem as a difference judgment without the existence of distractors such as viewpoint changes. However, in practice, viewpoint changes happen often and can overwhelm the real difference to be described. In this paper, we propose a novel visual encoder to explicitly distinguish viewpoint changes from real changes in the change captioning task. Moreover, we further simulate the attention preference of humans and propose a novel reinforcement learning process to fine-tune the attention directly with the language evaluation rewards. Extensive experimental results show that our method outperforms the state-of-the-art approaches by a large margin in both Spot-the-Diff and CLEVR-Change datasets.}
}
@inproceedings{bari-et-al-aaai-20,
abstract = {Recently, neural methods have achieved state-of-the-art (SOTA) results in Named Entity Recognition (NER) tasks for many languages without the need for manually crafted features. However, these models still require manually annotated training data, which is not available for many languages. In this paper, we propose an unsupervised cross-lingual NER model that can transfer knowledge from one language to another in a completely unsupervised way without relying on any bilingual dictionary or parallel data. Our model achieves this through end-to-end parameter sharing and adapting to the target domain through fine-tuning. Experiments on four different languages demonstrate the effectiveness of our approach, outperforming existing models by a good margin and setting a new SOTA for each language pair.},
address = {New York, USA},
author = {Saiful Bari and Shafiq Joty and Prathyusha Jwalapuram},
booktitle = {Thirty-Fourth AAAI Conference on Artificial Intelligence},
month = {February},
pages = {7415--7423},
publisher = {AAAI},
series = {AAAI'20},
title = {{Zero-Resource Cross-Lingual Named Entity Recognition}},
url = {https://ojs.aaai.org/index.php/AAAI/article/view/6237},
year = {2020}
}
@inproceedings{Yingzhu-et-al-interspeech-20a,
abstract = {Transformer model has made great progress in speech recognition. However, compared with models with iterative computation, transformer model has fixed encoder and decoder depth, thus losing the recurrent inductive bias. Besides, finding the optimal number of layers involves trial-and-error attempts. In this paper, the universal speech transformer is proposed, which to the best of our knowledge, is the first work to use universal transformer for speech recognition. It generalizes the speech transformer with dynamic numbers of encoder/decoder layers, which can relieve the burden of tuning depth related hyperparameters. Universal transformer adds the depth and positional embeddings repeatedly for each layer, which dilutes the acoustic information carried by hidden representation, and it also performs a partial update of hidden vectors between layers, which is less efficient especially on the very deep models. For better use of universal transformer, we modify its processing framework by removing the depth embedding and only adding the positional embedding once at transformer encoder frontend. Furthermore, to update the hidden vectors efficiently, especially on the very deep models, we adopt a full update. Experiments on LibriSpeech, Switchboard and AISHELL-1 datasets show that our model outperforms a baseline by 3.88%-13.7%, and surpasses other model with less computation cost.},
address = {Shanghai, China},
author = {Yingzhu Zhao and Chongjia Ni and Cheung-Chi LEUNG and Shafiq Joty and Eng Siong Chng and Bin Ma},
booktitle = {21st Annual Conference of the International Speech Communication Association},
month = {October},
pages = {5021--5025},
publisher = {ISCA},
series = {Interspeech'20},
title = {Universal Speech Transformer},
url = {https://www.isca-speech.org/archive/Interspeech_2020/pdfs/1716.pdf},
year = {2020}
}
@inproceedings{Yingzhu-et-al-interspeech-20b,
abstract = {End-to-end models have been introduced into automatic speech recognition (ASR) successfully and achieved superior performance compared with conventional hybrid systems, especially with the newly proposed transformer model. However, speaker mismatch between training and test data remains a problem, and speaker adaptation for transformer model can be further improved. In this paper, we propose to conduct speaker aware training for ASR in transformer model. Specifically, we propose to embed speaker knowledge through a persistent memory model into speech transformer encoder at utterance level. The speaker information is represented by a number of static speaker i-vectors, which is concatenated to speech utterance at each encoder self-attention layer. Persistent memory is thus formed by carrying speaker information through the depth of encoder. The speaker knowledge is captured from self-attention between speech and persistent memory vector in encoder. Experiment results on LibriSpeech, Switchboard and AISHELL-1 ASR task show that our proposed model brings relative 4.7%-12.5% word error rate (WER) reductions, and achieves superior results compared with other models with the same objective. Furthermore, our model brings relative 2.1%-8.3% WER reductions compared with the first persistent memory model used in ASR.},
address = {Shanghai, China},
author = {Yingzhu Zhao and Chongjia Ni and Cheung-Chi LEUNG and Shafiq Joty and Eng Siong Chng and Bin Ma},
booktitle = {21st Annual Conference of the International Speech Communication Association},
month = {October},
pages = {1261--1265},
publisher = {ISCA},
series = {Interspeech'20},
title = {Speech Transformer with Speaker Aware Persistent Memory},
url = {https://www.isca-speech.org/archive/Interspeech_2020/pdfs/1281.pdf},
year = {2020}
}
@inproceedings{Yingzhu-et-al-interspeech-20c,
abstract = {Transformer, a state-of-the-art neural network architecture, has been used successfully for different sequence-to-sequence transformation tasks. This model architecture disperses the attention distribution over entire input to learn long-term dependencies, which is important for some sequence-to-sequence tasks, such as neural machine translation and text summarization. However, automatic speech recognition (ASR) has a characteristic to have monotonic alignment between text output and speech input. Techniques like Connectionist Temporal Classification (CTC), RNN Transducer (RNN-T) and Recurrent Neural Aligner (RNA) build on top of this monotonic alignment and use local encoded speech representations for corresponding token prediction. In this paper, we present an effective cross attention biasing technique in transformer that takes monotonic alignment between text output and speech input into consideration by making use of cross attention weights. Specifically, a Gaussian mask is applied on cross attention weights to limit the input speech context range locally given alignment information. We further introduce a regularizer for alignment regularization. Experiments on LibriSpeech dataset find that our proposed model can obtain improved output-input alignment for ASR, and yields 14.5%-25.0% relative word error rate (WER) reductions.},
address = {Shanghai, China},
author = {Yingzhu Zhao and Chongjia Ni and Cheung-Chi LEUNG and Shafiq Joty and Eng Siong Chng and Bin Ma},
booktitle = {21st Annual Conference of the International Speech Communication Association},
month = {October},
pages = {5031--5035},
publisher = {ISCA},
series = {Interspeech'20},
title = {Cross Attention with Monotonic Alignment for Speech Transformer},
url = {https://www.isca-speech.org/archive/Interspeech_2020/pdfs/1198.pdf},
year = {2020}
}
@article{mohiuddin-joty-cl-19,
abstract = {Cross-lingual word embeddings learned from monolingual embeddings have a crucial role in many downstream tasks, ranging from machine translation to transfer learning. Adversarial training has shown impressive success in learning cross-lingual embeddings and the associated word translation task without any parallel data by mapping monolingual embeddings to a shared space. However, recent work has shown superior performance for non-adversarial methods in more challenging language pairs. In this article, we investigate adversarial autoencoder for unsupervised word translation and propose two novel extensions to it that yield more stable training and improved results. Our method includes regularization terms to enforce cycle consistency and input reconstruction, and puts the target encoders as an adversary against the corresponding discriminator. We use two types of refinement procedures sequentially after obtaining the trained encoders and mappings from the adversarial training, namely, refinement with Procrustes solution and refinement with symmetric re-weighting. Extensive experimentations with European, non-European and low-resource languages from two different datasets show that our method achieves better performance than existing adversarial and non-adversarial approaches and is also competitive with the supervised system. Along with performing comprehensive ablation studies to understand the contribution of different components of our adversarial model, we also conduct a thorough analysis of the refinement procedures to understand their effects.},
author = {Tasnim Mohiuddin and Shafiq Joty},
journal = {Computational Linguistics (presented at ACL-2020)},
number = {2},
pages = {1--32},
publisher = {MIT Press},
title = {{Unsupervised Word Translation with Adversarial Autoencoder}},
url = {https://www.mitpressjournals.org/doi/abs/10.1162/coli_a_00374},
volume = {46},
year = {2020}
}
@article{Jing-et-al-20,
abstract = {Rumors spread in social media severely jeopardize the credibility of online content. Thus, automatic debunking of rumors is of great importance to keep social media a healthy environment. While facing a dubious claim, people often dispute its truthfulness sporadically in their posts containing various cues, which can form useful evidence with long-distance dependencies. In this work, we propose to learn discriminative features from microblog posts by following their non-sequential propagation structure and generate more powerful representations for identifying rumors. For modeling non-sequential structure, we firstly represent the diffusion of microblog posts with propagation trees, which provide valuable clues on how a claim in the original post is transmitted and developed over time. We then present a bottom-up and a top-down tree-structured models based on Recursive Neural Networks (RvNN) for rumor representation learning and classification, which naturally conform to the message propagation process in microblogs. To enhance the rumor representation learning, we reveal that effective rumor detection is highly related to finding evidential posts, e.g., the posts expressing specific attitude towards the veracity of a claim, as an extension of the previous RvNN-based detection models that treat every post equally. For this reason, we design discriminative attention mechanisms for the RvNN-based models to selectively attend on the subset of evidential posts during the bottom-up/top-down recursive composition. Experimental results on four datasets collected from real-world microblog platforms confirm that 1) our RvNN-based models achieve much better rumor detection and classification performance than state-of-the-art approaches; 2) the attention mechanisms for focusing on evidential posts can further improve the performance of our RvNN-based method; and 3) our approach possesses superior capacity on detecting rumors at very early stage.},
articleno = {42},
author = {Jing Ma and Wei Gao and Shafiq Joty and Kam-Fai Wong},
doi = {10.1145/3391250},
journal = {ACM Transactions on Intelligent Systems and Technology (TIST)},
number = {4},
pages = {1--28},
publisher = {ACM},
title = {{An Attention-based Rumor Detection Model with Tree-structured Recursive Neural Networks}},
url = {https://dl.acm.org/doi/pdf/10.1145/3391250},
volume = {11},
year = {2020}
}
@article{Shi-et-al-neuro-20,
abstract = {The explosion of video data on the Internet requires effective and efficient technology to generate captions automatically for people, especially those who are visually impaired. Despite the great progress of video captioning research, particularly in video feature encoding, the language decoder is still largely based on the prevailing recurrent structure such as LSTM, which tends to prefer frequent words that align with the video and do not generalize well to new videos. In this paper, we propose a boundary-aware hierarchical language decoder for video captioning, which consists of a high-level decoder, working as a global (caption-level) language model, and a low-level decoder, working as a local (phrase-level) language model. Most importantly, we introduce a binary gate into the low-level language decoder to detect the phrasal boundaries. Together with other advanced components including a joint video prediction module, a shared soft attention, and a boundary-aware video encoding module, our integrated video captioning framework can discover hierarchical language information and distinguish the subjects from the objects of the verbs in a sentence, which are usually confusing during caption generation. Extensive experiments on two widely-used video captioning datasets, MSR-Video-to-Text (MSR-VTT) and YouTube-to-Text (MSVD), show that our method is highly competitive, compared with the state-of-the-art methods.},
author = {Xiangxi Shi and Jianfei Cai and Jiuxiang Gu and Shafiq Joty},
journal = {Neurocomputing},
pages = {347--356},
publisher = {Elsevier},
title = {{Video Captioning with Boundary-Aware Hierarchical Language Decoding and Joint Video Prediction}},
url = {https://www.sciencedirect.com/science/article/abs/pii/S0925231220313023},
year = {2020}
}
@article{Car-et-al-jmir-20,
abstract = {Background: Conversational agents, also known as chatbots, are computer programs designed to simulate human text or verbal conversations. They are increasingly used in a range of fields, including health care. By enabling better accessibility, personalization, and efficiency, conversational agents have the potential to improve patient care.
Objective: This study aimed to review the current applications, gaps, and challenges in the literature on conversational agents in health care and provide recommendations for their future research, design, and application.
Methods: We performed a scoping review. A broad literature search was performed in MEDLINE (Medical Literature Analysis and Retrieval System Online; Ovid), EMBASE (Excerpta Medica database; Ovid), PubMed, Scopus, and Cochrane Central with the search terms “conversational agents,” “conversational AI,” “chatbots,” and associated synonyms. We also searched the gray literature using sources such as the OCLC (Online Computer Library Center) WorldCat database and ResearchGate in April 2019. Reference lists of relevant articles were checked for further articles. Screening and data extraction were performed in parallel by 2 reviewers. The included evidence was analyzed narratively by employing the principles of thematic analysis.
Results: The literature search yielded 47 study reports (45 articles and 2 ongoing clinical trials) that matched the inclusion criteria. The identified conversational agents were largely delivered via smartphone apps (n=23) and used free text only as the main input (n=19) and output (n=30) modality. Case studies describing chatbot development (n=18) were the most prevalent, and only 11 randomized controlled trials were identified. The 3 most commonly reported conversational agent applications in the literature were treatment and monitoring, health care service support, and patient education.
Conclusions: The literature on conversational agents in health care is largely descriptive and aimed at treatment and monitoring and health service support. It mostly reports on text-based, artificial intelligence–driven, and smartphone app–delivered conversational agents. There is an urgent need for a robust evaluation of diverse health care conversational agents’ formats, focusing on their acceptability, safety, and effectiveness.},
author = {Lorainne Car and Dhakshenya Dhinagaran and Bhone Kyaw and Tobias Kowatsch and Shafiq Joty and Yin Theng and Rifat Atun},
doi = {10.2196/17158},
journal = {Journal of Medical Internet Research (JMIR)},
month = {Aug},
number = {8},
pages = {e17158},
title = {{Conversational agents in healthcare: a scoping review and conceptual analysis}},
url = {https://www.jmir.org/2020/8/e17158/},
volume = {22},
year = {2020}
}
@article{simeng-et-al-arxiv-19,
abstract = {Submodularity is a desirable property for a variety of objectives in content selection where the current neural encoder-decoder framework is deficient. We propose diminishing attentions, a class of novel attention mechanisms that exploit the properties of submodular functions. The resulting attention module offers an architecturally simple yet empirically effective method to improve the coverage of neural text generation. We run on three directed text generation tasks with different levels of recovering rate, across two modalities, three neural model architectures and two training strategy variations. The results and analyses demonstrate that our method generalizes well across these settings, produces texts of good quality, outperforms comparable baselines and achieves state-of-the-art performance.},
archiveprefix = {arXiv},
author = {Simeng Han and Xiang Lin and Shafiq Joty},
eprint = {1911.03014},
journal = {arXiv (* not peer reviewed)},
publisher = {arXiv.org},
title = {{Resurrecting Submodularity for Neural Text Generation}},
url = {https://arxiv.org/abs/1911.03014},
year = {2020}
}
@article{jwala-et-al-arxiv-20,
abstract = {Despite increasing instances of machine translation (MT) systems including contextual information, the evidence for translation quality improvement is sparse, especially for discourse phenomena. Popular metrics like BLEU are not expressive or sensitive enough to capture quality improvements or drops that are minor in size but significant in perception. We introduce the first of their kind MT benchmark datasets that aim to track and hail improvements across four main discourse phenomena: anaphora, lexical consistency, coherence and readability, and discourse connective translation. We also introduce evaluation methods for these tasks, and evaluate several baseline MT systems on the curated datasets. Surprisingly, we find that existing context-aware models do not improve discourse-related translations consistently across languages and phenomena.},
archiveprefix = {arXiv},
author = {Prathyusha Jwalapuram and Barbara Rychalska and Shafiq Joty and Dominika Basaj},
eprint = {2004.14607},
journal = {arXiv (* not peer reviewed)},
publisher = {arXiv.org},
title = {{Can Your Context-Aware MT System Pass the DiP Benchmark Tests? : Evaluation Benchmarks for Discourse Phenomena in Machine Translation}},
url = {https://arxiv.org/abs/2004.14607},
year = {2020}
}
@inproceedings{linlin-et-al-emnlp-19,
abstract = {Transition-based top-down parsing with pointer networks have achieved state-of-the-art results in multiple parsing tasks, while having a linear time complexity. However, the decoder of these parsers has a sequential structure, which does not yield the most appropriate inductive bias for deriving tree structures. In this paper, we propose hierarchical pointer network parsers, and apply them to dependency and discourse parsing tasks. Our results on standard benchmark datasets demonstrate the effectiveness of our approach, outperforming existing methods and setting a new state-of-the-art in both parsing tasks.},
address = {Hong Kong},
author = {Linlin Liu* and Xiang Lin* and Shafiq Joty and Simeng Han and Lidong Bing},
booktitle = {Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing},
numpages = {9},
pages = {1007--1017},
publisher = {ACL},
series = {EMNLP'19},
title = {Hierarchical Pointer Net Parsing},
url = {https://www.aclweb.org/anthology/D19-1093.pdf},
year = {2019}
}
@inproceedings{moon-et-al-emnlp-19,
abstract = {Recently, neural approaches to coherence modeling have achieved state-of-the-art results in several evaluation tasks. However, we show that most of these models often fail on harder tasks with more realistic application scenarios. In particular, the existing models underperform on tasks that require the model to be sensitive to local contexts such as candidate ranking in conversational dialogue and in machine translation. In this paper, we propose a unified coherence model that incorporates sentence grammar, inter-sentence coherence relations, and global coherence patterns into a common neural framework. With extensive experiments on local and global discrimination tasks, we demonstrate that our proposed model outperforms existing models by a good margin, and establish a new state-of-the-art.},
address = {Hong Kong},
author = {Han-Cheol Moon* and Tasnim Mohiuddin* and Shafiq Joty* and Chi Xu},
booktitle = {Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing},
numpages = {9},
pages = {2262--2272},
publisher = {ACL},
series = {EMNLP'19},
title = {A Unified Neural Coherence Model},
url = {https://www.aclweb.org/anthology/D19-1231/},
year = {2019}
}
@inproceedings{jwala-et-al-emnlp-19,
abstract = {The ongoing neural revolution in machine translation has made it easier to model larger contexts beyond the sentence-level, which can potentially help resolve some discourse-level ambiguities such as pronominal anaphora, thus enabling better translations. Unfortunately, even when the resulting improvements are seen as substantial by humans, they remain virtually unnoticed by traditional automatic evaluation measures like BLEU, as only a few words end up being affected. Thus, specialized evaluation measures are needed. With this aim in mind, we contribute an extensive, targeted dataset that can be used as a test suite for pronoun translation, covering multiple source languages and different pronoun errors drawn from real system translations, for English. We further propose an evaluation measure to differentiate good and bad pronoun translations. We also conduct a user study to report correlations with human judgments.},
address = {Hong Kong},
author = {Prathyusha Jwalapuram and Shafiq Joty and Irina Temnikova and Preslav Nakov},
booktitle = {Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing},
numpages = {9},
pages = {2964--2975},
publisher = {ACL},
series = {EMNLP'19},
title = {Evaluating Pronominal Anaphora in Machine Translation: An Evaluation Measure and a Test Suite},
url = {https://www.aclweb.org/anthology/D19-1294/},
year = {2019}
}
@inproceedings{khadanga-et-al-emnlp-19,
abstract = {Monitoring patients in ICU is a challenging and high-cost task. Hence, predicting the condition of patients during their ICU stay can help provide better acute care and plan the hospital's resources. There has been continuous progress in machine learning research for ICU management, and most of this work has focused on using time series signals recorded by ICU instruments. In our work, we show that adding clinical notes as another modality improves the performance of the model for three benchmark tasks: in-hospital mortality prediction, modeling decompensation, and length of stay forecasting that play an important role in ICU management. While the time-series data is measured at regular intervals, doctor notes are charted at irregular times, making it challenging to model them together. We propose a method to model them jointly, achieving considerable improvement across benchmark tasks over baseline time-series model.},
address = {Hong Kong},
author = {Swaraj Khadanga and Karan Aggarwal and Shafiq Joty and Jaideep Srivastava},
booktitle = {Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing},
numpages = {5},
pages = {6432--6437},
publisher = {ACL},
series = {EMNLP'19},
title = {Using Clinical Notes with Multimodal Learning for ICU Management},
url = {https://www.aclweb.org/anthology/D19-1678.pdf},
year = {2019}
}
@inproceedings{lin-et-al-acl-19,
abstract = {We propose an efficient neural framework for sentence-level discourse analysis in accordance with Rhetorical Structure Theory (RST). Our framework comprises a discourse segmenter to identify the elementary discourse units (EDU) in a text, and a discourse parser that constructs a discourse tree in a top-down fashion. Both the segmenter and the parser are based on Pointer Networks and operate in linear time. Our segmenter yields an F1 score of 95.4, and our parser achieves an F1 score of 81.7 on the aggregated labeled (relation) metric, surpassing previous approaches by a good margin and approaching human agreement on both tasks (98.3 and 83.0 F1).},
address = {Florence, Italy},
author = {Xiang Lin* and Shafiq Joty* and Prathyusha Jwalapuram and Saiful Bari},
booktitle = {Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics},
numpages = {9},
pages = {4190--4200},
publisher = {ACL},
series = {ACL'19},
title = {A Unified Linear-Time Framework for Sentence-Level Discourse Parsing},
url = {https://www.aclweb.org/anthology/P19-1410/},
year = {2019}
}
@inproceedings{jing-et-al-acl-19,
abstract = {Claim verification is generally a task of verifying the veracity of a given claim, which is critical to many downstream applications. It is cumbersome and inefficient for human fact-checkers to find consistent evidences, from which solid verdict could be inferred against the claim. In this paper, we propose a novel end-to-end hierarchical attention network focusing on learning to represent coherent evidences as well as their semantic relatedness with the claim. Our model consists of three main components: 1) A coherence-based attention layer embeds coherent evidences considering the claim and sentences from relevant articles; 2) An entailment-based attention layer attends on sentences that can semantically infer the claim on top of the first attention; and 3) An output layer predicts the verdict based on the embedded evidences. Experimental results on three public benchmark datasets show that our proposed model outperforms a set of state-of-the-art baselines.},
address = {Florence, Italy},
author = {Jing Ma and Wei Gao and Shafiq Joty and Kam-Fai Wong},
booktitle = {Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics},
numpages = {9},
pages = {2561--2571},
publisher = {ACL},
series = {ACL'19},
title = {Sentence-Level Evidence Embedding for End-to-End Claim Verification with Hierarchical Attention Networks},
url = {https://www.aclweb.org/anthology/P19-1244/},
year = {2019}
}
@inproceedings{joty-et-al-acl-19,
abstract = {Discourse processing is a suite of Natural Language Processing (NLP) tasks to uncover linguistic structures from texts at several levels, which can support many downstream applications. This involves identifying the topic structure, the coherence structure, the coreference structure, and the conversation structure for conversational discourse. Taken together, these structures can inform text summarization, machine translation, essay scoring, sentiment analysis, information extraction, question answering, and thread recovery. The tutorial starts with an overview of basic concepts in discourse analysis -- monologue vs. conversation, synchronous vs. asynchronous conversation, and key linguistic structures in discourse analysis. We also give an overview of linguistic structures and corresponding discourse analysis tasks that discourse researchers are generally interested in, as well as key applications on which these discourse structures have an impact.},
address = {Florence, Italy},
author = {Shafiq Joty* and Giuseppe Carenini* and Raymond Ng and Gabriel Murray},
booktitle = {Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics: Tutorial Abstracts},
pages = {1--6},
series = {ACL'19},
title = {Discourse Processing and Its Applications},
url = {papers/joty-et-al-acl-19.pdf},
year = {2019}
}
@inproceedings{gu-et-al-iccv-19,
abstract = {Deep neural networks have achieved great success on the image captioning task. However, most of the existing models depend heavily on paired image-sentence datasets, which are very expensive to acquire in most real-world scenarios. In this paper, we propose a scene graph based approach for unpaired image captioning. Our method merely requires an image set, a sentence corpus, an image scene graph generator, and a sentence scene graph generator. The sentence corpus is used to teach the decoder how to generate meaningful sentences from a scene graph. To further encourage the generated captions to be semantically consistent with the image, we employ adversarial learning to align the visual scene graph to the textual scene graph. Experimental results show that our proposed model can generate quite promising results without using any image-caption training pairs, outperforming existing methods by a wide margin.},
address = {Seoul, Korea},
author = {Jiuxiang Gu and Shafiq Joty and Jianfei Cai and Handong Zhao and Xu Yang and Gang Wang},
booktitle = {Proceedings of the International Conference on Computer Vision},
pages = {10323--10332},
publisher = {IEEE},
series = {ICCV'19},
title = {Unpaired Image Captioning via Scene Graph Alignments},
url = {https://openaccess.thecvf.com/content_ICCV_2019/html/Gu_Unpaired_Image_Captioning_via_Scene_Graph_Alignments_ICCV_2019_paper.html},
year = {2019}
}
@inproceedings{shi-et-al-acmmm-19,
abstract = {With the rapid growth of video data and the increasing demands of various applications such as intelligent video search and assistance toward visually-impaired people, video captioning task has received a lot of attention recently in computer vision and natural language processing fields. The state-of-the-art video captioning methods focus more on encoding the temporal information, while lack of effective ways to remove irrelevant temporal information and also neglecting the spatial details. However, the current RNN encoding module in single time order can be influenced by the irrelevant temporal information, especially the irrelevant temporal information is at the beginning of the encoding. In addition, neglecting spatial information will lead to the relationship confusion of the words and detailed loss. Therefore, in this paper, we propose a novel recurrent video encoding method and a novel visual spatial feature for the video captioning task. The recurrent encoding module encodes the video twice with the predicted key frame to avoid the irrelevant temporal information often occurring at the beginning and the end of a video. The novel spatial features represent the spatial information in different regions of a video and enrich the details of a caption. Experiments on two benchmark datasets show superior performance of the proposed method.},
address = {Nice, France},
author = {Xiangxi Shi and Jianfei Cai and Shafiq Joty and Jiuxiang Gu},
booktitle = {Proceedings of the 27th ACM International Conference on Multimedia},
doi = {10.1145/3343031.3351060},
numpages = {10},
pages = {818--826},
publisher = {ACM},
series = {ACMMM'19},
title = {Watch It Twice: Video Captioning with a Refocused Video Encoder},
url = {https://dl.acm.org/doi/10.1145/3343031.3351060},
year = {2019}
}
@inproceedings{mohiuddin-joty-naacl-19,
abstract = {Adversarial training has shown impressive success in learning bilingual dictionary without any parallel data by mapping monolingual embeddings to a shared space. However, recent work has shown superior performance for non-adversarial methods in more challenging language pairs. In this work, we revisit adversarial autoencoder for unsupervised word translation and propose two novel extensions to it that yield more stable training and improved results. Our method includes regularization terms to enforce cycle consistency and input reconstruction, and puts the target encoders as an adversary against the corresponding discriminator. Extensive experimentations with European and non-European languages show that our method achieves better performance than recently proposed adversarial and non-adversarial approaches and is also competitive with the supervised system.},
address = {Minneapolis, USA},
author = {Tasnim Mohiuddin and Shafiq Joty},
booktitle = {Proceedings of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies},
numpages = {9},
pages = {3857--3867},
publisher = {ACL},
series = {NAACL'19},
title = {Revisiting Adversarial Autoencoder for Unsupervised Word Translation with Cycle Consistency and Improved Training},
url = {https://www.aclweb.org/anthology/N19-1386/},
year = {2019}
}
@inproceedings{joty-mohiuddin-nguyen-naacl-19,
abstract = {We address the problem of speech act recognition (SAR) in asynchronous conversations (e.g., forums, emails). However, unlike synchronous conversations (e.g., meetings, phone), asynchronous domains lack large labeled datasets to train an effective SAR model. In this paper, we propose methods to effectively leverage abundant unlabeled conversational data and the available labeled data from synchronous domains. We carry out our research in three main steps. First, we introduce a neural architecture based on hierarchical LSTMs and conditional random fields (CRF) for SAR in asynchronous conversations, and show that our method outperforms existing methods when trained on in-domain data only. Second, we improve our initial SAR models by semi-supervised learning in the form of pretrained word embeddings learned from a large unlabeled conversational corpus. Finally, we employ adversarial training to improve the results further by leveraging the labeled data from synchronous domains and by explicitly modeling the shift in two domains.},
address = {Minneapolis, USA},
author = {Tasnim Mohiuddin* and Thanh-Tung Nguyen* and Shafiq Joty*},
booktitle = {Proceedings of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies},
numpages = {9},
pages = {1326--1336},
publisher = {ACL},
series = {NAACL'19},
title = {Adaptation of Hierarchical Structured Models for Speech Act Recognition in Asynchronous Conversation},
url = {https://www.aclweb.org/anthology/N19-1134/},
year = {2019}
}
@inproceedings{karan-et-al-aaai-19,
abstract = {Sufficient physical activity and restful sleep play a major role in the prevention and cure of many chronic conditions. Being able to proactively screen and monitor such chronic conditions would be a big step forward for overall health. The rapid increase in the popularity of wearable devices provides a significant new source, making it possible to track the user’s lifestyle real-time. In this paper, we propose a novel unsupervised representation learning technique called activity2vec that learns and “summarizes” the discrete-valued activity time-series. It learns the representations with three components: (i) the co-occurrence and magnitude of the activity levels in a time-segment, (ii) neighboring context of the time-segment, and (iii) promoting subject-invariance with adversarial training. We evaluate our method on four disorder prediction tasks using linear classifiers. Empirical evaluation and analysis demonstrate that our proposed method performs better than many strong baselines, and adversarial learning helps improve the generalizability of our representations by promoting subject invariant features. We also show that using the representations at the level of a day works the best since human activity is structured in terms of daily routines.},
address = {Honolulu, Hawaii},
author = {Karan Aggarwal and Shafiq Joty and Luis Fernandez-Luque and Jaideep Srivastava},
booktitle = {Thirty-Third AAAI Conference on Artificial Intelligence},
month = {January},
pages = {834--841},
publisher = {AAAI},
series = {AAAI'19},
title = {Adversarial Unsupervised Representation Learning for Activity Time-Series},
url = {https://arxiv.org/abs/1811.06847},
year = {2019}
}
@inproceedings{sameer-et-al-icassp-19,
abstract = {We present the Factorial Deep Markov Model (FDMM) for representation learning of speech. The FDMM learns disentangled, interpretable and lower dimensional latent representations from speech without supervision. We use a static and dynamic latent variable to exploit the fact that information in a speech signal evolves at different time scales. Latent representations learned by the FDMM outperform a baseline ivector system on speaker verification and dialect identification while also reducing the error rate of a phone recognition system in a domain mismatch scenario.},
address = {Brighton, UK},
author = {Sameer Khurana and Shafiq Joty and Ahmed Ali and James Glass},
booktitle = {International Conference on Acoustics, Speech, and Signal Processing},
month = {May},
pages = {6540--6544},
publisher = {IEEE},
series = {ICASSP'19},
title = {A Factorial Deep Markov Model For Unsupervised Disentangled Representation Learning From Speech},
url = {https://groups.csail.mit.edu/sls/publications/2019/SameerKhurana_ICASSP-2019.pdf},
year = {2019}
}
@inproceedings{liu-et-al-sigmod-demo-19,
abstract = {A core component of a database systems course at the undergraduate level is the design and implementation of the query optimizer in a rdbms. The query optimization process produces a query execution plan (qep) which represents an execution strategy for a sql query. Unfortunately, in practice, it is often difficult for a student to comprehend the query execution strategy by perusing the qep, hindering her learning process. In this demonstration, we present a novel system called neuron that facilitates natural language interaction with qeps to enhance its understanding. neuron accepts a sql query (which may include joins, aggregation, nesting, among other things) as input, executes it, and generates a simplified natural language-based description (both in text and voice form) of the execution strategy deployed by the underlying rdbms. Furthermore, it facilitates understanding of various features related to the qep through a natural language-based question answering framework. We advocate that such tool, world’s first of its kind, can greatly enhance students’ learning of the query optimization topic.},
address = {Amsterdam, The Netherlands},
author = {Siyuan Liu and Sourav S Bhowmick and Wanlu Zhang and Shu Wang and Wanyi Huang and Shafiq Joty},
booktitle = {Proceedings of 45th ACM SIGMOD International Conference on Management of Data (Demo)},
month = {July},
pages = {1953--1956},
publisher = {ACM},
series = {SIGMOD'19 (Demo)},
title = {NEURON: Query Execution Plan Meets Natural Language Processing For Augmenting DB Education},
url = {papers/liu-et-al-sigmod-demo-19.pdf},
year = {2019}
}
@article{nguyen-joty-iclr-19,
abstract = {Most state-of-the-art neural machine translation systems, despite being different in architectural skeletons (e.g. recurrence, convolutional), share an indispensable feature: the Attention. However, most existing attention methods are token-based and ignore the importance of phrasal alignments, the key ingredient for the success of phrase-based statistical machine translation. In this paper, we propose novel phrase-based attention methods to model n-grams of tokens as attention entities. We incorporate our phrase-based attentions into the recently proposed Transformer network, and demonstrate that our approach yields improvements of 1.3 BLEU for English-to-German and 0.5 BLEU for German-to-English translation tasks on WMT newstest2014 using WMT'16 training data.},
archiveprefix = {arXiv},
author = {Phi Xuan Nguyen and Shafiq Joty},
eprint = {1810.03444},
journal = {arXiv (* not peer reviewed)},
publisher = {arXiv.org},
title = {{Phrase-Based Attentions}},
url = {https://arxiv.org/abs/1810.03444},
year = {2018}
}
@article{joty-cl-si-18,
abstract = {Participants in asynchronous conversations (e.g., forums, emails) interact with each other at different times, performing certain communicative acts, called speech acts (e.g., question, request). In this article, we propose a hybrid approach to speech act recognition in asynchronous conversations. Our approach works in two main steps: a long short-term memory recurrent neural network (LSTM-RNN) first encodes each sentence separately into a task-specific distributed representation, which is then used in a conditional random field (CRF) model to capture the conversational dependencies between sentences. The LSTM-RNN uses pretrained word embeddings learned from a large conversational corpus and is trained to classify sentences into speech act types. The CRF model can consider arbitrary graph structures to model conversational dependencies in an asynchronous conversation. In addition, to mitigate the problem of limited annotated data in the asynchronous domains, we adapt the LSTM-RNN model to learn from synchronous conversations (e.g., meetings) using domain adversarial training of neural networks. Empirical evaluation shows the effectiveness of our approach over existing ones: (i) LSTM-RNNs provide better task-specific representations, (ii) conversational word embeddings benefit the LSTM-RNNs more than the off-the-shelf ones, (iii) adversarial training gives better domain-invariant representations, and (iv) the global CRF model improves over local models.},
author = {Shafiq Joty and Tasnim Mohiuddin},
doi = {10.1162/coli_a_00339},
journal = {Computational Linguistics (Special Issue on Language in Social Media, Exploiting discourse and other contextual information)},
number = {4},
pages = {859--894},
publisher = {MIT Press},
title = {{Speech Act Modeling of Written Asynchronous Conversations: A Neural CRF Approach}},
url = {https://www.mitpressjournals.org/doi/pdf/10.1162/coli_a_00339},
volume = {44},
year = {2018}
}
@inproceedings{joty-et-al-emnlp-18,
abstract = {We address, for the first time, the joint resolution of two important Question Answering tasks on community forums: given a new question, (i) find related existing questions, and (ii) find relevant answers to the new question. We further use an auxiliary task to complement the previous two, i.e., (iii) find good answers with respect to the thread question in a question-comment thread. We use deep neural networks (DNNs) to learn meaningful task-specific embeddings, which we then incorporate into a conditional random field (CRF) model on the multitask problem, performing joint learning over arbitrary graph structures. The experimental results show that DNNs alone achieve competitive results when trained to produce the embeddings. Furthermore, the CRF model is able to effectively make use of the embeddings and the dependencies between the tasks to improve results significantly and consistently across a variety of evaluation metrics, thus showing the complementarity of DNNs and structured learning.},
address = {Brussels, Belgium},
author = {Joty, Shafiq and M{\`a}rquez, Llu{\'i}s and Nakov, Preslav},
booktitle = {Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing},
pages = {4196--4207},
publisher = {Association for Computational Linguistics},
series = {EMNLP'18},
title = {Joint Multitask Learning for Community Question Answering Using Task-Specific Embeddings},
url = {http://aclweb.org/anthology/D18-1452},
year = {2018}
}
@inproceedings{joty-mohiuddin-nguyen-acl-18,
abstract = {We propose a novel coherence model for written asynchronous conversations (e.g., forums, emails), and show its applications in coherence assessment and thread reconstruction tasks. We conduct our research in two steps. First, we propose improvements to the recently proposed neural entity grid model by lexicalizing its entity transitions. Then, we extend the model to asynchronous conversations by incorporating the underlying conversational structure in the entity grid representation and feature computation. Our model achieves state of the art results on standard coherence assessment tasks in monologue and conversations outperforming existing models. We also demonstrate its effectiveness in reconstructing thread structures.},
address = {Melbourne, Australia},
author = {Shafiq Joty* and Tasnim Mohiuddin* and Dat Nguyen*},
booktitle = {Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics},
pages = {558--568},
publisher = {Association for Computational Linguistics},
series = {ACL'18},
title = {{Coherence Modeling of Asynchronous Conversations: A Neural Entity Grid Approach}},
url = {http://aclweb.org/anthology/P18-1052},
year = {2018}
}
@inproceedings{firoj-et-al-acl-18,
abstract = {In recent years there has been a growing interest in deep neural networks (DNN) and representation learning with applications to a myriad of NLP and data mining problems. The success of DNNs is heavily dependent on the availability of labeled data. However, obtaining labeled data is a big challenge in many real-world problems. In such cases, a DNN model can leverage labeled and unlabeled data from a related domain, but it has to deal with the shift in data distributions between the domains. In this paper, we study the problem of classifying social media posts during a crisis event (e.g., Earthquake). For that, we use labeled and unlabeled data from past similar events (e.g., Flood) and unlabeled data for the current event. We propose a novel model that performs adversarial learning based domain adaptation to deal with distribution drifts and graph based semi-supervised learning to leverage unlabeled data within a single unified deep learning framework. Our experiments with two real-world crisis datasets collected from Twitter demonstrate significant improvements over several baselines.},
address = {Melbourne, Australia},
author = {Firoj Alam and Shafiq Joty and Muhammad Imran},
booktitle = {Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics},
pages = {1077--1087},
publisher = {Association for Computational Linguistics},
series = {ACL'18},
title = {{Domain Adaptation with Adversarial Training and Graph Embeddings}},
url = {http://aclweb.org/anthology/P18-1099},
year = {2018}
}
@inproceedings{gu-et-al-eccv-18,
abstract = {Image captioning is a multimodal task involving computer vision and natural language processing, where the goal is to learn a mapping from the image to its natural language description. In general, the mapping function is learned from a training set of image-caption pairs. However, for some language, large scale image-caption paired corpus might not be available. We present an approach to this unpaired image captioning problem by language pivoting. Our method can effectively capture the characteristics of an image captioner from the pivot language (Chinese) and align it to the target language (English) using another pivot-target (Chinese-English) parallel corpus. We evaluate our method on two image-to-English benchmark datasets: MSCOCO and Flickr30K. Quantitative comparisons against several baseline approaches demonstrate the effectiveness of our method.},
address = {Munich, Germany},
author = {Jiuxiang Gu and Shafiq Joty and Jianfei Cai and Gang Wang},
booktitle = {European Conference on Computer Vision},
publisher = {Springer},
series = {ECCV'18},
title = {Unpaired Image Captioning by Language Pivoting},
url = {https://arxiv.org/abs/1803.05526},
year = {2018}
}
@inproceedings{qing-et-al-eccv-18,
abstract = {Most existing works in visual question answering (VQA) are dedicated to improving the accuracy of predicted answers, while disregarding the explanations. We argue that the explanation for an answer is of the same or even more importance compared with the answer itself, since it makes the question and answering process more understandable and traceable. To this end, we propose a new task of VQA-E (VQA with Explanation), where the computational models are required to generate an explanation with the predicted answer. We first construct a new dataset, and then frame the VQA-E problem in a multi-task learning architecture. Our VQA-E dataset is automatically derived from the VQA v2 dataset by intelligently exploiting the available captions. We have conducted a user study to validate the quality of explanations synthesized by our method. We quantitatively show that the additional supervision from explanations can not only produce insightful textual sentences to justify the answers, but also improve the performance of answer prediction. Our model outperforms the state-of-the-art methods by a clear margin on the VQA v2 dataset.},
address = {Munich, Germany},
author = {Qing Li and Qingyi Tao and Shafiq Joty and Jianfei Cai and Jiebo Luo},
booktitle = {European Conference on Computer Vision},
publisher = {Springer},
series = {ECCV'18},
title = {VQA-E: Explaining, Elaborating, and Enhancing Your Answers for Visual Questions},
url = {https://arxiv.org/abs/1803.07464},
year = {2018}
}
@inproceedings{gu-et-al-cvpr-18,
abstract = {Textual-visual cross-modal retrieval has been a hot research topic in both computer vision and natural language processing communities. Learning appropriate representations for multi-modal data is crucial for the cross-modal retrieval performance. Unlike existing image-text retrieval approaches that embed image-text pairs as single feature vectors in a common representational space, we propose to incorporate generative processes into the cross-modal feature embedding, through which we are able to learn not only the global abstract features but also the local grounded features. Extensive experiments show that our framework can well match images and sentences with complex content, and achieve the state-of-the-art cross-modal retrieval results on MSCOCO dataset.},
address = {Salt Lake City, Utah, USA},
author = {Jiuxiang Gu and Jianfei Cai and Shafiq Joty and Li Niu and Gang Wang},
booktitle = {Computer Vision and Pattern Recognition},
publisher = {IEEE},
series = {CVPR'18, Spotlight},
title = {Look, Imagine and Match: Improving Textual-Visual Cross-Modal Retrieval with Generative Models},
url = {https://arxiv.org/abs/1711.06420},
year = {2018}
}
@inproceedings{chin-et-al-cikm-18,
abstract = {Textual reviews, which are readily available on many e-commerce and review websites such as Amazon and Yelp, serve as an invaluable source of information for recommender systems. However, not all parts of the reviews are equally important, and the same choice of words may reflect a different meaning based on its context. In this paper, we propose a novel end-to-end Aspect-based Neural Recommender (ANR) to perform aspect-based representation learning for both users and items via an attention-based component. Furthermore, we model the multi-faceted process behind how users rate items by estimating the aspect-level user and item importance by adapting the neural co-attention mechanism. Our proposed model concurrently address several shortcomings of existing recommender systems, and a thorough experimental study on 25 benchmark datasets from Amazon and Yelp shows that ANR significantly outperforms recently proposed state-of-the-art baselines such as DeepCoNN, D-Attn and ALFM.},
address = {Turin, Italy},
author = {Jin Yao Chin and Kaiqi Zhao and Shafiq Joty and Gao Cong},
booktitle = {Proceedings of the 27th ACM International Conference on Information and Knowledge Management},
month = {October},
pages = {147--156},
publisher = {ACM},
series = {CIKM'18},
title = {ANR: Aspect-based Neural Recommender},
url = {https://dl.acm.org/citation.cfm?id=3271810},
year = {2018}
}
@inproceedings{li-sun-joty-ijcai-18,
abstract = {Text segmentation is a fundamental task in natural language processing. Depending on the levels of granularity, the task can be defined as segmenting a document into topical segments, or segmenting a sentence into elementary discourse units. Traditional solutions to the two tasks heavily rely on carefully designed features. The recently proposed neural models do not need manual feature engineering, but they either suffer from sparse boundary tags or they cannot well handle the issue of variable size output vocabulary. Our generic end-to-end segmentation model, named SEGBOT, uses a bidirectional recurrent neural network to encode input text sequence. The model then uses another recurrent neural network together with a pointer network to select text boundaries in the input sequence. In this way, SEGBOT does not require hand-crafted features. More importantly, our model inherently handles the issue of variable size output vocabulary and the issue of sparse boundary tags. In our experiments, SEGBOT outperforms state-of-the-art models on two tasks, document-level topic segmentation and sentence-level discourse segmentation.},
address = {Stockholm, Sweden},
author = {Jing Li and Aixin Sun and Shafiq Joty},
booktitle = {Proceedings of the 27th International Joint Conference on Artificial Intelligence and the 23rd European Conference on Artificial Intelligence},
doi = {10.24963/ijcai.2018/579},
month = {July},
pages = {4166--4172},
publisher = {International Joint Conferences on Artificial Intelligence Organization},
series = {IJCAI-ECAI-2018},
title = {SegBot: A Generic Neural Text Segmentation Model with Pointer Network},
url = {https://www.ijcai.org/proceedings/2018/0579.pdf},
year = {2018}
}
@inproceedings{karan-et-al-ieee-big-data-18,
abstract = {Sleep plays a vital role in human health, both mental and physical. Sleep disorders like sleep apnea are increasing in prevalence, with the rapid increase in factors like obesity. Sleep apnea is most commonly treated with Continuous Positive Air Pressure (CPAP) therapy. Presently, however, there is no mechanism to monitor a patient's progress with CPAP. Accurate detection of sleep stages from CPAP flow signal is crucial for such a mechanism. We propose, for the first time, an automated sleep staging model based only on the flow signal. Deep neural networks have recently shown high accuracy on sleep staging by eliminating handcrafted features. However, these methods focus exclusively on extracting informative features from the input signal, without paying much attention to the dynamics of sleep stages in the output sequence. We propose an end-to-end framework that uses a combination of deep convolution and recurrent neural networks to extract high-level features from raw flow signal with a structured output layer based on a conditional random field to model the temporal transition structure of the sleep stages. We improve upon the previous methods by 10\% using our model, that can be augmented to the previous sleep staging deep learning methods. We also show that our method can be used to accurately track sleep metrics like sleep efficiency calculated from sleep stages that can be deployed for monitoring the response of CPAP therapy on sleep apnea patients. Apart from the technical contributions, we expect this study to motivate new research questions in sleep science.},
address = {Seattle, WA, USA},
author = {Karan Aggarwal and Swaraj Khadanga and Shafiq Joty and Louis Kazaglis and Jaideep Srivastava},
booktitle = {IEEE Big Data 2018},
publisher = {IEEE},
title = {A Structured Learning Approach with Neural Conditional Random Fields for Sleep Staging},
url = {papers/karan-et-al-ieee-big-data-18.pdf},
year = {2018}
}
@inproceedings{Ebraheem-et-al-pvldb-18,
abstract = {Despite the efforts in 70+ years in all aspects of Entity resolution (ER), there is still a high demand for democratizing ER – by reducing the heavy human involvement in labeling data, performing feature engineering, tuning parameters, and defining blocking functions. With the recent advances in deep learning, in particular distributed representations of words (a.k.a. word embeddings), we present a novel ER system, called DeepER, that achieves good accuracy, high efficiency, as well as ease-of-use (i.e., much less human efforts). We use sophisticated composition methods, namely uni- and bi-directional recurrent neural networks (RNNs) with long short term memory (LSTM) hidden units, to convert each tuple to a distributed representation (i.e., a vector), which can in turn be used to effectively capture similarities between tuples. We consider both the case where pre-trained word embeddings are available as well the case where they are not; we present ways to learn and tune the distributed representations that are customized for a specific ER task under different scenarios. We propose a locality sensitive hashing (LSH) based blocking approach that takes all attributes of a tuple into consideration and produces much smaller blocks, compared with traditional methods that consider only a few attributes. We evaluate our algorithms on multiple datasets (including benchmarks, biomedical data, as well as multi-lingual data) and the extensive experimental results show that DeepER outperforms existing solutions.},
address = {Rio de Janeiro, Brazil},
author = {Muhammad Ebraheem and Saravanan Thirumuruganathan and Shafiq Joty and Mourad Ouzzani and Nan Tang},
booktitle = {The Forty-fourth International Conference on Very Large Data Bases},
month = {August},
number = {10},
pages = {1454--1467},
publisher = {VLDB Endowment},
series = {VLDB-2018},
title = {Distributed Representations of Tuples for Entity Resolution},
url = {http://www.vldb.org/pvldb/vol11/p1454-ebraheem.pdf},
volume = {11},
year = {2018}
}
@inproceedings{murray-joty-carenini-coling-18,
abstract = {The primary goal of this tutorial is for attendees to learn about recent work applying NLP to spoken and written conversations, with a focus on computational models for three related topics: conversational structure, summarization and sentiment detection, and group dynamics. We provide examples of specific NLP tasks within those three areas, how they relate to one another, their applications, and how we evaluate task performance. We will begin by discussing motivations and applications of applying NLP methods to conversations, including downstream applications that could benefit. Attendees will hear about the challenges of working with noisy data, and examples of datasets of spoken and/or written conversations. The first part of the tutorial covers conversational structures, the basic building blocks for working with conversational data. Participants will learn about computational methods for uncovering thread and topic structures of a conversation, detecting dialogue acts and adjacency pairs, identifying participant roles (where relevant), and how to treat disfluencies. We will cover methods for both synchronous (e.g., meeting, phone) and asynchronous (e.g., forum, email) conversations. In the second part of the tutorial, we will focus on sentiment analysis and summarization. Attendees will learn about the related, overlapping tasks of detecting sentiment, subjectivity, and opinions. We will cover unsupervised and supervised approaches, as well as multimodal sentiment detection. Participants will learn about intrinsic vs. extrinsic evaluation of sentiment analysis methods for conversations. For summarization, we will cover core topics, such as the notions of extractive vs. abstractive summarization, and summarization vs. compression. In particular, participants will learn about the limits of extractive summarization on noisy and opinion-filled conversation data. We will particularly emphasize the question of how to evaluate automatically generated summaries, including some of the controversial history surrounding automatic summarization metrics that are widely used. In the final part of the tutorial, participants will learn about the growing field of research that uses NLP and machine learning methods to model and predict group dynamics, including prediction of group performance and participant affect. Attendees will learn about the close relationship between these three areas of summarization, sentiment, and group dynamics, and why researchers in each one of those areas often end up being concerned with the other two topics as well. Finally, we will discuss promising current and future directions of applying NLP to conversations.},
address = {Santa Fe, New Mexico, USA},
author = {Gabriel Murray* and Shafiq Joty* and Giuseppe Carenini*},
booktitle = {Proceedings of the 27th International Conference on Computational Linguistics: Tutorial Abstracts},
month = {August},
pages = {1--4},
series = {COLING'18},
title = {NLP for Conversations: Sentiment, Summarization, and Group Dynamics},
url = {https://sites.google.com/view/nlpforconversations},
year = {2018}
}
@inproceedings{joty-et-al-icdm-18,
  author    = {Shafiq Joty and Giuseppe Carenini and Raymond Ng and Gabriel Murray},
  title     = {Discourse Processing and Its Applications in Text
Mining},
  booktitle = {IEEE International Conference on Data Mining: Tutorial Abstracts},
  series    = {ICDM'18},
  address   = {Singapore},
  month     = {November},
  year      = {2018},
  pages     = {1--2},
  url       = {https://ntunlpsg.github.io/project/icdmtutorial/},
  abstract  = {Discourse processing is a suite of Natural Language
Processing (NLP) tasks to uncover linguistic structures from texts at several levels, which can support many text mining applications. This involves identifying the topic structure, the coherence structure, the coreference structure, and the conversation structure for conversational discourse. Taken together, these structures can inform text summarization, essay scoring, sentiment analysis, machine translation, information extraction, question answering, and thread recovery. The tutorial starts with an overview of basic concepts in discourse analysis – monologue vs. conversation, synchronous vs. asynchronous conversation, and key linguistic structures in discourse analysis. It then covers traditional machine learning methods along with the most recent works using deep learning, and compare their performances on benchmark datasets. For each discourse structure we describe, we show its applications in downstream text mining tasks.}
}
@inproceedings{firoj-et-al-icwsm-18,
abstract = {During time-critical situations such as natural disasters, rapid classification of data posted on social networks by affected people is useful for humanitarian organizations to gain situational awareness and to plan response efforts. However, the scarcity of labeled data in the early hours of a crisis hinders machine learning tasks thus delays crisis response. In this work, we propose to use an inductive semi-supervised technique to utilize unlabeled data, which is often abundant at the onset of a crisis event, along with fewer labeled data. Specifically, we adopt a graph-based deep learning framework to learn an inductive semi-supervised model. We use two real-world crisis datasets from Twitter to evaluate the proposed approach. Our results show significant improvements using unlabeled data as compared to only using labeled data.},
address = {Stanford, California},
author = {Firoj Alam and Shafiq Joty and Muhammad Imran},
booktitle = {Proceedings of the Twelfth International Conference on Web and Social Media},
month = {June},
pages = {556--559},
publisher = {AAAI},
series = {ICWSM'18},
title = {Graph Based Semi-supervised Learning with Convolutional Neural Networks to Classify Crisis Related Tweets},
url = {https://www.aaai.org/ocs/index.php/ICWSM/ICWSM18/paper/download/17815/17049},
year = {2018}
}
@article{joty-guzman-marquez-nakov-cl-17,
abstract = {In this article, we explore the potential of using sentence-level discourse structure for machine translation evaluation. We first design discourse-aware similarity measures, which use all-subtree kernels to compare discourse parse trees in accordance with the Rhetorical Structure Theory (RST). Then, we show that a simple linear combination with these measures can help improve various existing machine translation evaluation metrics regarding correlation with human judgments both at the segment- and at the system-level. This suggests that discourse information is complementary to the information used by many of the existing evaluation metrics, and thus it could be taken into account when developing richer evaluation metrics, such as the WMT-14 winning combined metric DISCOTKparty. We also provide a detailed analysis of the relevance of various discourse elements and relations from the RST parse trees for machine translation evaluation. In particular, we show that (i) all aspects of the RST tree are relevant, (ii) nuclearity is more useful than relation type, and (iii) the similarity of the translation RST tree to the reference RST tree is positively correlated with translation quality.},
author = {Shafiq Joty and Guzm{\'{a}}n, Francisco and Màrquez, Lluís and Preslav Nakov},
journal = {Computational Linguistics},
number = {4},
pages = {683--722},
publisher = {MIT Press},
title = {Discourse Structure in Machine Translation Evaluation},
url = {http://www.mitpressjournals.org/doi/pdfplus/10.1162/COLI_a_00298},
volume = {43},
year = {2017}
}
@article{joty-durrani-sajjad-abdelali-csl-17,
abstract = {We explore neural joint models for the task of domain adaptation in machine translation in two ways: (i) we apply state-of-the-art domain adaptation techniques, such as mixture modelling and data selection using the recently proposed Neural Network Joint Model (NNJM) (Devlin et al., 2014); (ii) we propose two novel approaches to perform adaptation through instance weighting and weight readjustment in the NNJM framework. In our first approach, we propose a pair of models called Neural Domain Adaptation Models (NDAM) that minimizes the cross entropy by regularizing the loss function with respect to in-domain (and optionally to out-domain) model. In the second approach, we present a set of Neural Fusion Models (NFM) that combines the in- and the out-domain models by readjusting their parameters based on the in-domain data. We evaluated our models on the standard task of translating English-to-German and Arabic-to-English TED talks. The NDAM models achieved better perplexities and modest BLEU improvements compared to the baseline NNJM, trained either on in-domain or on a concatenation of in- and out-domain data. On the other hand, the NFM models obtained significant improvements of up to +0.9 and +0.7 BLEU points, respectively. We also demonstrate improvements over existing adaptation methods such as instance weighting, phrasetable fill-up, linear and log-linear interpolations.},
author = {Shafiq Joty and Nadir Durrani and Hassan Sajjad and Ahmed Abdelali},
doi = {10.1016/j.csl.2016.12.006},
issn = {0885-2308},
journal = {Computer Speech \& Language (Special Issue on Deep Learning for Machine Translation)},
pages = {161--179},
publisher = {Elsevier},
title = {Domain Adaptation Using Neural Network Joint Model},
url = {http://www.sciencedirect.com/science/article/pii/S0885230816301474},
volume = {45},
year = {2017}
}
@article{guzman-joty-marquez-nakov-csl-16,
abstract = {We present a framework for machine translation evaluation using neural networks in a pairwise setting, where the goal is to select the better translation from a pair of hypotheses, given the reference translation. In this framework, lexical, syntactic and semantic information from the reference and the two hypotheses is embedded into compact distributed vector representations, and fed into a multi-layer neural network that models nonlinear interactions between each of the hypotheses and the reference, as well as between the two hypotheses. We experiment with the benchmark datasets from the WMT Metrics shared task, on which we obtain the best results published so far, with the basic network configuration. We also perform a series of experiments to analyze and understand the contribution of the different components of the network. We evaluate variants and extensions, including fine-tuning of the semantic embeddings, and sentence-based representations modeled with convolutional and recurrent neural networks. In summary, the proposed framework is flexible and generalizable, allows for efficient learning and scoring, and provides an MT evaluation metric that correlates with human judgments, and is on par with the state of the art.},
author = {Guzm{\'{a}}n, Francisco and Joty, Shafiq and Màrquez, Lluís and Nakov, Preslav},
doi = {10.1016/j.csl.2016.12.005},
issn = {0885-2308},
journal = {Computer Speech \& Language (Special Issue on Deep Learning for Machine Translation)},
pages = {180--200},
publisher = {Elsevier},
title = {Machine Translation Evaluation with Neural Networks},
url = {http://www.sciencedirect.com/science/article/pii/S0885230816301693},
volume = {45},
year = {2017}
}
@inproceedings{nguyen-joty-acl-17,
  author    = {Dat Nguyen* and Shafiq Joty*},
  title     = {A Neural Local Coherence Model},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics},
  series    = {ACL'17},
  publisher = {Association for Computational Linguistics},
  address   = {Vancouver, Canada},
  month     = {August},
  year      = {2017},
  pages     = {1320--1330},
  url       = {papers/nguyen-joty-acl-17.pdf},
  abstract  = {We propose a local coherence model based on a convolutional neural network that operates over the entity grid representation of a text. The model captures long range entity transitions along with entity-specific features without loosing generalization, thanks to the power of distributed representation. We present a pairwise ranking method to train the model in an end-to-end fashion on a task and learn task-specific high level features. Our evaluation on three different coherence assessment tasks demonstrates that our model achieves state of the art results outperforming existing models by a good margin.}
}
@inproceedings{joty-nakov-marquez-jaradat-conll-17,
abstract = {We address the problem of cross-language adaptation for question-question similarity reranking in community question answering, with the objective to port a system trained on one input language to another input language given labeled training data for the first language and only unlabeled data for the second language. In particular, we propose to use adversarial training of neural networks to learn high-level features that are discriminative for the main learning task, and at the same time are invariant across the input languages. The evaluation results show sizable improvements for our cross-language adversarial neural network (CLANN) model over a strong non-adversarial system.},
address = {Vancouver, Canada},
author = {Shafiq Joty and Preslav Nakov and Lluís Màrquez and Israa Jaradat},
booktitle = {Proceedings of The SIGNLL Conference on Computational Natural Language Learning},
month = {August},
pages = {226--237},
publisher = {Association for Computational Linguistics},
series = {CoNLL'17},
title = {Cross-language Learning with Adversarial Neural Networks: Application to Community Question Answering},
url = {papers/joty-nakov-marquez-jaradat-conll-17.pdf},
year = {2017}
}
@inproceedings{saha-joty-hasan-ecml-17,
abstract = {We present a novel approach to learn distributed representation of sentences from unlabeled data by modeling both content and context of a sentence. The content model learns sentence representation by predicting its words. On the other hand, the context model comprises a neighbor prediction component and a regularizer to model distributional and proximity hypotheses, respectively. We propose an online algorithm to train the model components jointly. We evaluate the models in a setup, where contextual information is available. The experimental results on tasks involving classification, clustering, and ranking of sentences show that our model outperforms the best existing models by a wide margin across multiple datasets.},
address = {Skopje, Macedonia},
author = {Tanay Saha and Shafiq Joty and Mohammad Hasan},
booktitle = {Proceedings of The European Conference on Machine Learning \& Principles and Practice of Knowledge Discovery in Databases},
month = {September},
publisher = {Springer},
series = {ECML-PKDD'17},
title = {CON-S2V: A Generic Framework for Incorporating Extra-Sentential Context into Sen2Vec},
url = {papers/saha-joty-hasan-ecml-17.pdf},
year = {2017}
}
@inproceedings{saha-joty-hassan-hasan-cikm-17,
abstract = {Vector representation of sentences is important for many text processing tasks that involve classifying, clustering, or ranking sentences. For solving these tasks, bag-of-word based representation has been used for a long time. In recent years, distributed representation of sentences learned by neural models from unlabeled data has been shown to outperform traditional bag-of-words representations. However, most existing methods belonging to the neural models consider only the content of a sentence, and disregard its relations with other sentences in the context. In this paper, we first characterize two types of contexts depending on their scope and utility. We then propose two approaches to incorporate contextual information into content-based models. We evaluate our sentence representation models in a setup, where context is available to infer sentence vectors. Experimental results demonstrate that our proposed models outshine existing models on three fundamental tasks, such as, classifying, clustering, and ranking sentences.},
address = {Singapore},
author = {Tanay Saha and Shafiq Joty and Naeemul Hassan and Mohammad Hasan},
booktitle = {Proceedings of the 26th ACM International Conference on Information and Knowledge Management},
month = {November},
publisher = {ACM},
series = {CIKM'17},
title = {Regularized and Retrofitted Models for Learning Sentence Representation with Context},
url = {papers/saha-joty-hassan-hasan-cikm-17.pdf},
year = {2017}
}
@inproceedings{martino-et-al-sigir-17,
abstract = {We study how to find relevant questions in community forums when the language of the new questions is different from that of the existing questions in the forum. In particular, we explore the Arabic-English language pair. We compare a kernel-based system with a feed-forward neural network in a scenario where a large parallel corpus is available for training a machine translation system, bilingual dictionaries, and cross-language word embeddings. We observe that both approaches degrade the performance of the system when working on the translated text, especially the kernel-based system, which depends heavily on a syntactic kernel. We address this issue using a cross-language tree kernel, which compares the original Arabic tree to the English trees of the related questions. We show that this kernel almost closes the performance gap with respect to the monolingual system. On the neural network side, we use the parallel corpus to train cross-language embeddings, which we then use to represent the Arabic input and the English related questions in the same space. The results also improve to close to those of the monolingual neural network. Overall, the kernel system shows a better performance compared to the neural network in all cases.},
address = {Tokyo, Japan},
author = {Da San Martino, Giovanni and Salvatore Romeo and Alberto Barron-Cedeno and Shafiq Joty and Lluís Màrquez and Alessandro Moschitti and Preslav Nakov},
booktitle = {Proceedings of the 40th International ACM SIGIR Conference on Research and Development in Information Retrieval},
month = {August},
pages = {1145--1148},
publisher = {ACM},
series = {SIGIR'17},
title = {Cross-Language Question Re-ranking},
url = {papers/martino-et-al-sigir-17.pdf},
year = {2017}
}
@inproceedings{nguyen-et-al-icwsm-17,
abstract = {The role of social media, in particular microblogging platforms such as Twitter, as a conduit for actionable and tactical information during disasters is increasingly acknowledged. However, time-critical analysis of big crisis data on social media streams brings challenges to machine learning techniques, especially the ones that use supervised learning. The scarcity of labeled data, particularly in the early hours of a crisis, delays the learning process. Existing classification methods require a significant amount of labeled data specific to a particular event for training plus a lot of feature engineering to achieve best results. In this work, we introduce neural network based classification methods for identifying useful tweets during a crisis situation. At the onset of a disaster when no labeled data is available, our proposed method makes the best use of the out-of-event data and achieves good results.},
address = {Montr{\'{e}}al, Qu{\'{e}}bec, Canada},
author = {Dat Nguyen and Kamela Ali Al Mannai and Shafiq Joty and Hassan Sajjad and Muhammad Imran and Prasenjit Mitra},
booktitle = {Proceedings of the Eleventh International Conference on Web and Social Media},
publisher = {AAAI},
series = {ICWSM'17},
title = {Robust Classification of Crisis-Related Data on Social Networks Using Convolutional Neural Networks},
url = {papers/nguyen-et-al-icwsm-17.pdf},
year = {2017}
}
@inproceedings{hoque-et-al-iui-17,
abstract = {Community question answering (CQA) forums can provide effective means for sharing information and addressing a user's information needs about particular topics. However, many such online forums are not moderated, resulting in many low quality and redundant comments, which makes it very challenging for users to find the appropriate answers to their questions. In this paper, we apply a user-centered design approach to develop a system, CQAVis, which supports users in identifying high quality comments and get their questions answered. Informed by the user's requirements, the system combines both text analytics and interactive visualization techniques together in a synergistic way. Given a new question posed by the user, the text analytic module automatically finds relevant answers by exploring existing related questions and the comments within their threads. Then the visualization module presents the search results to the user and supports the exploration of related comments. We have evaluated the system in the wild by deploying it within a CQA forum among thousands of real users. Through the online study, we gained deeper insights about the potential utility of the system, as well as learned generalizable lessons for designing visual text analytics systems for the domain of CQA forums.},
address = {Limassol, Cyprus},
author = {Enamul Hoque and Shafiq Joty and Lluís Màrquez and Giuseppe Carenini},
booktitle = {Proceedings of the 2017 International Conference on Intelligent User Interfaces},
pages = {161--172},
publisher = {ACM},
series = {IUI'17},
title = {{CQAVis}: Visual Text Analytics for Community Question Answering},
url = {http://dl.acm.org/citation.cfm?id=3025210},
year = {2017}
}
@inproceedings{nguyen-joty-boussaha-rijke-neuir-17,
abstract = {Discussion forums are an important source of information. They are often used to answer specific questions a user might have and to discover more about a topic of interest. Discussions in these forums may evolve in intricate ways, making it difficult for users to follow the flow of ideas. We propose a novel approach for automatically identifying the underlying thread structure of a forum discussion. Our approach is based on a neural model that computes coherence scores of possible reconstructions and then selects the highest scoring, i.e., the most coherent one. Preliminary experiments demonstrate promising results outperforming a number of strong baseline methods.},
address = {Tokyo, Japan},
author = {Dat Nguyen and Shafiq Joty and Basma Boussaha and Maarten de Rijke},
booktitle = {Proceedings of the Neu-IR 2017 SIGIR Workshop on Neural Information Retrieval},
month = {August},
publisher = {ACM},
series = {NeuIR'17},
title = {Thread Reconstruction in Conversational Data using Neural Coherence Models},
url = {papers/nguyen-joty-boussaha-rijke-neuir-17.pdf},
year = {2017}
}
@article{sathyanarayana-et-al-jmu-16,
abstract = {BACKGROUND: The importance of sleep is paramount to health. Insufficient sleep can reduce physical, emotional, and mental well-being and can lead to a multitude of health complications among people with chronic conditions. Physical activity and sleep are highly interrelated health behaviors. Our physical activity during the day (ie, awake time) influences our quality of sleep, and vice versa. The current popularity of wearables for tracking physical activity and sleep, including actigraphy devices, can foster the development of new advanced data analytics. This can help to develop new electronic health (eHealth) applications and provide more insights into sleep science. OBJECTIVE: The objective of this study was to evaluate the feasibility of predicting sleep quality (ie, poor or adequate sleep efficiency) given the physical activity wearable data during awake time. In this study, we focused on predicting good or poor sleep efficiency as an indicator of sleep quality. METHODS: Actigraphy sensors are wearable medical devices used to study sleep and physical activity patterns. The dataset used in our experiments contained the complete actigraphy data from a subset of 92 adolescents over 1 full week. Physical activity data during awake time was used to create predictive models for sleep quality, in particular, poor or good sleep efficiency. The physical activity data from sleep time was used for the evaluation. We compared the predictive performance of traditional logistic regression with more advanced deep learning methods: multilayer perceptron (MLP), convolutional neural network (CNN), simple Elman-type recurrent neural network (RNN), long short-term memory (LSTM-RNN), and a time-batched version of LSTM-RNN (TB-LSTM). RESULTS: Deep learning models were able to predict the quality of sleep (ie, poor or good sleep efficiency) based on wearable data from awake periods. More specifically, the deep learning methods performed better than traditional logistic regression. CNN had the highest specificity and sensitivity, and an overall area under the receiver operating characteristic (ROC) curve (AUC) of 0.9449, which was 46\% better as compared with traditional logistic regression (0.6463). CONCLUSIONS: Deep learning methods can predict the quality of sleep based on actigraphy data from awake periods. These predictive models can be an important tool for sleep research and to improve eHealth solutions for sleep.},
author = {Aarti Sathyanarayana and Shafiq Joty and Luis Fernandez-Luque and Ferda Ofli and Jaideep Srivastava and Ahmed Elmagarmid and Shahrad Taheri and Teresa Arora},
doi = {10.2196/mhealth.6562},
journal = {JMIR mHealth and uHealth (JMU)},
number = {4},
pages = {e125},
pmid = {27815231},
title = {Sleep Quality Prediction From Wearable Data Using Deep Learning},
url = {https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5116102/},
volume = {4},
year = {2016}
}
@inproceedings{joty-hoque-acl-16,
abstract = {This paper addresses the problem of speech act recognition in written asynchronous conversations (e.g., fora, emails). We propose a class of conditional structured models defined over arbitrary graph structures to capture the conversational dependencies between sentences. Our models use sentence representations encoded by a long short term memory (LSTM) recurrent neural model. Empirical evaluation shows the effectiveness of our approach over existing ones: (i) LSTMs provide better task-specific representations, and (ii) the global joint model improves over local models.},
address = {Berlin, Germany},
author = {Shafiq Joty and Enamul Hoque},
booktitle = {Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics},
numpages = {11},
pages = {1746--1756},
publisher = {Association for Computational Linguistics},
series = {ACL'16},
title = {Speech Act Modeling of Written Asynchronous Conversations with Task-Specific Embeddings and Conditional Structured Models},
url = {papers/joty-hoque-acl-16.pdf},
year = {2016}
}
@inproceedings{joty-marquez-nakov-naacl-16,
abstract = {This paper addresses the problem of comment classification in community Question Answering. Following the state of the art, we approach the task with a global inference process to exploit the information of all comments in the answer-thread in the form of a fully connected graph. Our contribution comprises two novel joint learning models that are on-line and integrate inference within learning. The first one jointly learns two node- and edge-level MaxEnt classifiers with stochastic gradient descent and integrates the inference step with loopy belief propagation. The second model is an instance of fully connected pairwise CRFs (FCCRF). The FCCRF model significantly outperforms all other approaches and yields the best results on the task to date. Crucial elements for its success are the global normalization and an Ising-like edge potential.},
address = {San Diego, California},
author = {Shafiq Joty and M{\`{a}}rquez, Llu{\'{i}}s and Preslav Nakov},
booktitle = {Proceedings of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies},
numpages = {11},
pages = {703--713},
publisher = {Association for Computational Linguistics},
series = {NAACL'16},
title = {Joint Learning with Global Inference for Comment Classification in Community Question Answering},
year = {2016}
}
@inproceedings{durrani-sajjad-joty-abdelali-coling-16,
abstract = {We present a novel fusion model for domain adaptation in Statistical Machine Translation. Our model is based on the joint source-target neural network (Devlin et al., 2014), and is learned by fusing in- and out-domain models. The adaptation is performed by backpropagating errors from the output layer to the word embedding layer of each model, subsequently adjusting parameters of the composite model towards the in-domain data. On the standard tasks of translating English-to-German and Arabic-to-English TED talks, we observed average improvements of +0.9 and +0.7 BLEU points, respectively over a competition grade phrase-based system. We also demonstrate improvements over existing adaptation methods.},
address = {Osaka, Japan},
author = {Nadir Durrani and Hassan Sajjad and Shafiq Joty and Ahmed Abdelali},
booktitle = {Proceedings of the 26th International Conference on Computational Linguistics},
month = {December},
pages = {3177--3187},
publisher = {The COLING 2016 Organizing Committee},
series = {COLING'16},
title = {A Deep Fusion Model for Domain Adaptation in Phrase-based {MT}},
url = {http://aclweb.org/anthology/C/C16/C16-1299.pdf},
year = {2016}
}
@inproceedings{hoque-et-al-coling-16-demo,
abstract = {We present an interactive system to provide effective and efficient search capabilities in Community Question Answering (cQA) forums. The system integrates state-of-the-art technology for answer search with a Web-based user interface specifically tailored to support the cQA forum readers. The answer search module automatically finds relevant answers for a new question by exploring related questions and the comments within their threads. The graphical user interface presents the search results and supports the exploration of related information. The system is running live as a part of the Qatar Living forums.},
address = {Osaka, Japan},
author = {Hoque, Enamul and Joty, Shafiq and M\`{a}rquez, Llu\'{i}s and Barron-Cedeno, Alberto and Da San Martino, Giovanni and Moschitti, Alessandro and Nakov, Preslav and Romeo, Salvatore and Carenini, Giuseppe},
booktitle = {Proceedings of the 26th International Conference on Computational Linguistics: System Demonstrations},
month = {December},
pages = {1--5},
publisher = {The COLING 2016 Organizing Committee},
series = {COLING'16},
title = {An Interactive System for Exploring Community Question Answering Forums},
url = {https://aclanthology.org/C16-2001},
year = {2016}
}
@inproceedings{cedeno-et-al-semeval-16,
abstract = {We describe our system, ConvKN, participating to the SemEval-2016 Task 3 "Community Question Answering". The task targeted the reranking of questions and comments in real-life web fora both in English and Arabic. ConvKN combines convolutional tree kernels with convolutional neural networks and additional manually designed features including text similarity and thread specific features. For the first time, we applied tree kernels to syntactic trees of Arabic sentences for a reranking task. Our approaches obtained the second best results in three out of four tasks. The only task we performed averagely is the one where we did not use tree kernels in our classifier.},
address = {San Diego, California},
author = {Barron-Cedeno, Alberto and Da San Martino, Giovanni and Joty, Shafiq and Moschitti, Alessandro and Al-Obaidli, Fahad and Romeo, Salvatore and Tymoshenko, Kateryna and Uva, Antonio},
booktitle = {Proceedings of the 10th International Workshop on Semantic Evaluation (SemEval-2016)},
month = {June},
pages = {896--903},
publisher = {Association for Computational Linguistics},
series = {SemEval'16},
title = {ConvKN at SemEval-2016 Task 3: Answer and Question Selection for Question Answering on Arabic and English Fora},
url = {https://aclanthology.org/S16-1138},
year = {2016}
}
@inproceedings{nguyen-et-al-swdm-16,
abstract = {During natural or man-made disasters, humanitarian response organizations look for useful information to support their decision-making processes. Social media platforms such as Twitter have been considered as a vital source of useful information for disaster response and management. Despite advances in natural language processing techniques, processing short and informal Twitter messages is a challenging task. In this paper, we propose to use Deep Neural Network (DNN) to address two types of information needs of response organizations: 1) identifying informative tweets and 2) classifying them into topical classes. DNNs use distributed representation of words and learn the representation as well as higher level features automatically for the classification task. We propose a new online algorithm based on stochastic gradient descent to train DNNs in an online fashion during disaster situations. We test our models using a crisis-related real-world Twitter dataset.},
address = {Indianapolis, USA},
archiveprefix = {arXiv},
author = {Nguyen, Dat Tien and Joty, Shafiq and Imran, Muhammad and Sajjad, Hassan and Mitra, Prasenjit},
booktitle = {4th International Workshop on Social Web for Disaster Management},
eprint = {1610.01030},
series = {SWDM'16},
title = {Applications of Online Deep Learning for Crisis Response Using Social Media Information},
url = {https://arxiv.org/abs/1610.01030},
year = {2016}
}
@article{joty-carenini-ng-cl-15,
abstract = {Clauses and sentences rarely stand on their own in an actual discourse; rather, the relationship between them carries important information that allows the discourse to express a meaning as a whole beyond the sum of its individual parts. Rhetorical analysis seeks to uncover this coherence structure. In this article, we present CODRA— a COmplete probabilistic Discriminative framework for performing Rhetorical Analysis in accordance with Rhetorical Structure Theory, which posits a tree representation of a discourse. CODRA comprises a discourse segmenter and a discourse parser. First, the discourse segmenter, which is based on a binary classifier, identifies the elementary discourse units in a given text. Then the discourse parser builds a discourse tree by applying an optimal parsing algorithm to probabilities inferred from two Conditional Random Fields: one for intra-sentential parsing and the other for multi-sentential parsing. We present two approaches to combine these two stages of parsing effectively. By conducting a series of empirical evaluations over two different data sets, we demonstrate that CODRA significantly outperforms the state-of-the-art, often by a wide margin. We also show that a reranking of the k-best parse hypotheses generated by CODRA can potentially improve the accuracy even further.},
author = {Joty, Shafiq and Carenini, Giuseppe and Ng, Raymond T.},
journal = {Computational Linguistics},
number = {3},
pages = {385--435},
publisher = {MIT Press},
title = {{CODRA}: A Novel Discriminative Framework for Rhetorical Analysis},
url = {papers/joty-carenini-ng-cl-15},
volume = {41},
year = {2015}
}
@inproceedings{guzman-joty-marquez-nakov-acl-15,
abstract = {We present a novel framework for machine translation evaluation using neural networks in a pairwise setting, where the goal is to select the better translation from a pair of hypotheses, given the reference translation. In this framework, lexical, syntactic and semantic information from the reference and the two hypotheses is compacted into relatively small distributed vector representations, and fed into a multi-layer neural network that models the interaction between each of the hypotheses and the reference, as well as between the two hypotheses. These compact representations are in turn based on word and sentence embeddings, which are learned using neural networks. The framework is flexible, allows for efficient learning and classification, and yields correlation with humans that rivals the state of the art.},
address = {Beijing, China},
author = {Guzm\'{a}n, Francisco and Joty, Shafiq and M\`{a}rquez, Llu\'{i}s and Nakov, Preslav},
booktitle = {Proceedings of the 53rd Annual Meeting of the Association for Computational Linguistics and the 7th International Joint Conference on Natural Language Processing},
month = {July},
pages = {805--814},
publisher = {Association for Computational Linguistics},
series = {ACL'15},
title = {Pairwise Neural Machine Translation Evaluation},
url = {https://aclanthology.org/P15-1078},
year = {2015}
}
@inproceedings{cedeno-et-al-acl-15,
abstract = {Community Question Answering (cQA) is a new application of QA in social contexts (e.g., fora). It presents new interesting challenges and research directions, e.g., exploiting the dependencies between the different comments of a thread to select the best answer for a given question. In this paper, we explored two ways of modeling such dependencies: (i) by designing specific features looking globally at the thread; and (ii) by applying structure prediction models. We trained and evaluated our models on data from SemEval-2015 Task 3 on Answer Selection in cQA. Our experiments show that: (i) the thread-level features consistently improve the performance for a variety of machine learning models, yielding state-of-the-art results; and (ii) sequential dependencies between the answer labels captured by structured prediction models are not enough to improve the results, indicating that more information is needed in the joint model.},
address = {Beijing, China},
author = {Barron-Cedeno, Alberto and Filice, Simone and Da San Martino, Giovanni and Joty, Shafiq and M\`{a}rquez, Llu\'{i}s and Nakov, Preslav and Moschitti, Alessandro},
booktitle = {Proceedings of the 53rd Annual Meeting of the Association for Computational Linguistics and the 7th International Joint Conference on Natural Language Processing},
month = {July},
pages = {687--693},
publisher = {Association for Computational Linguistics},
series = {ACL'15},
title = {Thread-Level Information for Comment Classification in Community Question Answering},
url = {https://aclanthology.org/P15-2113},
year = {2015}
}
@inproceedings{durrani-et-al-amta-15,
abstract = {Joint models have recently shown to improve the state-of-the-art in machine translation (MT). We apply EM-based mixture modeling and data selection techniques using two joint models, namely the Operation Sequence Model or OSM — an ngram-based translation and reordering model, and the Neural Network Joint Model or NNJM — a continuous space translation model, to carry out domain adaptation for MT. The diversity of the two models, OSM with inherit reordering information and NNJM with continuous space modeling makes them interesting to be explored for this task. Our contribution in this paper is fusing the existing known techniques (linear interpolation, cross-entropy) with the state-of-the-art MT models (OSM, NNJM). On a standard task of translating German-to-English and Arabic-to-English IWSLT TED talks, we observed statistically significant improvements of up to +0.9 BLEU points.},
address = {Florida, USA},
author = {Durrani, Nadir and Sajjad, Hassan and Joty, Shafiq and Abdelali, Ahmed and Vogel, Stephan},
booktitle = {Proceedings of the Association for Machine Translation in the Americas},
month = {November},
pages = {687--693},
series = {AMTA'15},
title = {Using Joint Models for Domain Adaptation in Statistical Machine Translation},
url = {papers/durrani-et-al-amta-15},
year = {2015}
}
@inproceedings{joty-et-al-emnlp-15-1,
abstract = {We present novel models for domain adaptation based on the neural network joint model (NNJM). Our models maximize the cross entropy by regularizing the loss function with respect to in-domain model. Domain adaptation is carried out by assigning higher weight to out-domain sequences that are similar to the in-domain data. In our alternative model we take a more restrictive approach by additionally penalizing sequences similar to the out-domain data. Our models achieve better perplexities than the baseline NNJM models and give improvements of up to 0.5 and 0.6 BLEU points in Arabic-to-English and English-to-German language pairs, on a standard task of translating TED talks.},
address = {Lisbon, Portugal},
author = {Joty, Shafiq and Sajjad, Hassan and Durrani, Nadir and Al-Mannai, Kamla and Abdelali, Ahmed and Vogel, Stephan},
booktitle = {Proceedings of the 2015 Conference on Empirical Methods in Natural Language Processing},
pages = {1259--1270},
publisher = {Association for Computational Linguistics},
series = {EMNLP'15},
title = {How to Avoid Unwanted Pregnancies: Domain Adaptation using Neural Network Models},
url = {https://aclanthology.org/D15-1147},
year = {2015}
}
@inproceedings{liu-joty-meng-emnlp-15,
abstract = {The tasks in fine-grained opinion mining can be regarded as either a token-level sequence labeling problem or as a semantic compositional task. We propose a general class of discriminative models based on recurrent neural networks (RNNs) and word embeddings that can be successfully applied to such tasks without any task-specific feature engineering effort. Our experimental results on the task of opinion target identification show that RNNs, without using any hand-crafted features, outperform feature-rich CRF-based models. Our framework is flexible, allows us to incorporate other linguistic features, and achieves results that rival the top performing systems in SemEval-2014.},
address = {Lisbon, Portugal},
author = {Liu, Pengfei and Joty, Shafiq and Meng, Helen},
booktitle = {Proceedings of the 2015 Conference on Empirical Methods in Natural Language Processing},
pages = {1433--1443},
publisher = {Association for Computational Linguistics},
series = {EMNLP'15},
title = {Fine-grained Opinion Mining with Recurrent Neural Networks and Word Embeddings},
url = {https://aclanthology.org/D15-1168},
year = {2015}
}
@inproceedings{joty-et-al-emnlp-15-2,
abstract = {Community question answering, a recent evolution of question answering in the Web context, allows a user to quickly consult the opinion of a number of people on a particular topic, thus taking advantage of the wisdom of the crowd. Here we try to help the user by deciding automatically which answers are good and which are bad for a given question. In particular, we focus on exploiting the output structure at the thread level in order to make more consistent global decisions. More specifically, we exploit the relations between pairs of comments at any distance in the thread, which we incorporate in a graph-cut and in an ILP frameworks. We evaluated our approach on the benchmark dataset of SemEval-2015 Task 3. Results improved over the state of the art, confirming the importance of using thread level information.},
address = {Lisbon, Portugal},
author = {Joty, Shafiq and Barron-Cedeno, Alberto and Da San Martino, Giovanni and Filice, Simone and M\`{a}rquez, Llu\'{i}s and Moschitti, Alessandro and Nakov, Preslav},
booktitle = {Proceedings of the 2015 Conference on Empirical Methods in Natural Language Processing},
pages = {573--578},
publisher = {Association for Computational Linguistics},
series = {EMNLP'15},
title = {Global Thread-level Inference for Comment Classification in Community Question Answering},
url = {https://aclanthology.org/D15-1068},
year = {2015}
}
@inproceedings{nicosia-et-al-semeval-15,
abstract = {This paper describes QCRI’s participation in SemEval-2015 Task 3 “Answer Selection in Community Question Answering”, which targeted real-life Web forums, and was offered in both Arabic and English. We apply a supervised machine learning approach considering a manifold of features including among others word n-grams, text similarity, sentiment analysis, the presence of specific words, and the context of a comment. Our approach was the best performing one in the Arabic subtask and the third best in the two English subtasks.},
address = {Denver, Colorado, USA},
author = {Nicosia, Massimo and Filice, Simone and Barron-Cedeno, Alberto and Saleh, Iman and Mubarak, Hamdy and Gao, Wei and Nakov, Preslav and Da San Martino, Giovanni and Moschitti, Alessandro and Darwish, Kareem and M\`{a}rquez, Llu\'{i}s and Joty, Shafiq R. and Magdy, Walid},
booktitle = {Proceedings of the 9th International Workshop on Semantic Evaluation},
month = {June},
pages = {203--209},
series = {SemEval'15},
title = {{QCRI:} Answer Selection for Community Question Answering -- Experiments for Arabic and English},
url = {https://aclanthology.org/S15-2036.pdf},
year = {2015}
}
@inproceedings{guzman-joty-marquez-nakov-acl-14,
abstract = {We present experiments in using discourse structure for improving machine translation evaluation. We first design two discourse-aware similarity measures, which use all-subtree kernels to compare discourse parse trees in accordance with the Rhetorical Structure Theory. Then, we show that these measures can help improve a number of existing machine translation evaluation metrics both at the segment- and at the system-level. Rather than proposing a single new metric, we show that discourse information is complementary to the state-of-the-art evaluation metrics, and thus should be taken into account in the development of future richer evaluation metrics.},
address = {Baltimore, Maryland, USA},
author = {Guzm\'{a}n, Francisco and Joty, Shafiq and M\`{a}rquez, Llu\'{i}s and Nakov, Preslav},
booktitle = {Proceedings of the 52nd Annual Meeting of the Association for Computational Linguistics},
month = {June},
pages = {687--698},
publisher = {Association for Computational Linguistics},
series = {ACL'14},
title = {Using Discourse Structure Improves Machine Translation Evaluation},
url = {https://aclanthology.org/P14-1065},
year = {2014}
}
@inproceedings{guzman-et-al-emnlp-14,
abstract = {We present a pairwise learning-to-rank approach to machine translation evaluation that learns to differentiate better from worse translations in the context of a given reference. We integrate several layers of linguistic information encapsulated in tree-based structures, making use of both the reference and the system output simultaneously, thus bringing our ranking closer to how humans evaluate translations. Most importantly, instead of deciding upfront which types of features are important, we use the learning framework of preference re-ranking kernels to learn the features automatically. The evaluation results show that learning in the proposed framework yields better correlation with humans than computing the direct similarity over the same type of structures. Also, we show our structural kernel learning (SKL) can be a general framework for MT evaluation, in which syntactic and semantic information can be naturally incorporated.},
address = {Doha, Qatar},
author = {Guzm\'{a}n, Francisco and Joty, Shafiq and M\`{a}rquez, Llu\'{i}s and Moschitti, Alessandro and Nakov, Preslav and Nicosia, Massimo},
booktitle = {Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing},
month = {October},
pages = {214--220},
publisher = {Association for Computational Linguistics},
series = {EMNLP'14},
title = {Learning to Differentiate Better from Worse Translations},
url = {https://aclanthology.org/D14-1027},
year = {2014}
}
@inproceedings{joty-guzman-marquez-nakov-wmt-14,
abstract = {We present novel automatic metrics for machine translation evaluation that use discourse structure and convolution kernels to compare the discourse tree of an automatic translation with that of the human reference. We experiment with five transformations and augmentations of a base discourse tree representation based on the rhetorical structure theory, and we combine the kernel scores for each of them into a single score. Finally, we add other metrics from the ASIYA MT evaluation toolkit, and we tune the weights of the combination on actual human judgments. Experiments on the WMT12 and WMT13 metrics shared task datasets show correlation with human judgments that outperforms what the best systems that participated in these years achieved, both at the segment and at the system level.},
address = {Baltimore, Maryland, USA},
author = {Joty, Shafiq and Guzm\'{a}n, Francisco and M\`{a}rquez, Llu\'{i}s and Nakov, Preslav},
booktitle = {Proceedings of the Ninth Workshop on Statistical Machine Translation},
month = {June},
pages = {402--408},
publisher = {Association for Computational Linguistics},
series = {WMT'14},
title = {DiscoTK: Using Discourse Structure for Machine Translation Evaluation},
url = {https://aclanthology.org/W14-3352},
year = {2014}
}
@inproceedings{hoque-carenini-joty-illvi-14,
abstract = {Exploring an online conversation can be very difficult for a user, especially when it becomes a long complex thread. We follow a human-centered design approach to tightly integrate text mining methods with interactive visualization techniques to support the users in fulfilling their information needs. The resulting visual text analytic system provides multifaceted exploration of asynchronous conversations. We discuss a number of open challenges and possible directions for further improvement including the integration of interactive human feedback in the text mining loop, applying more advanced text analysis methods with visualization techniques, and evaluating the system with real users.},
address = {Baltimore, Maryland, USA},
author = {Hoque, Enamul and Carenini, Giuseppe and Joty, Shafiq},
booktitle = {Proceedings of the Workshop on Interactive Language Learning, Visualization, and Interfaces},
month = {June},
pages = {45--52},
publisher = {Association for Computational Linguistics},
series = {ILLVI'14},
title = {Interactive Exploration of Asynchronous Conversations: Applying a User-centered Approach to Design a Visual Text Analytic System},
url = {https://aclanthology.org/W14-3107},
year = {2014}
}
@inproceedings{joty-moschitti-emnlp-14,
abstract = {In this paper, we present a discriminative approach for reranking discourse trees generated by an existing probabilistic discourse parser. The reranker relies on tree kernels (TKs) to capture the global dependencies between discourse units in a tree. In particular, we design new computational structures of discourse trees, which combined with standard TKs, originate novel discourse TKs. The empirical evaluation shows that our reranker can improve the state-of-the-art sentence-level parsing accuracy from 79.77% to 82.15%, a relative error reduction of 11.8%, which in turn pushes the state-of-the-art document-level accuracy from 55.8% to 57.3%.},
address = {Doha, Qatar},
author = {Joty, Shafiq and Moschitti, Alessandro},
booktitle = {Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing},
month = {October},
pages = {2049--2060},
publisher = {Association for Computational Linguistics},
series = {EMNLP'14},
title = {Discriminative Reranking of Discourse Parses Using Tree Kernels},
url = {https://aclanthology.org/D14-1219},
year = {2014}
}
@inproceedings{saleh-et-al-emnlp-14,
abstract = {We present an empirical study on the use of semantic information for Concept Segmentation and Labeling (CSL), which is an important step for semantic parsing. We represent the alternative analyses output by a state-of-the-art CSL parser with tree structures, which we rerank with a classifier trained on two types of semantic tree kernels: one processing structures built with words, concepts and Brown clusters, and another one using semantic similarity among the words composing the structure. The results on a corpus from the restaurant domain show that our semantic kernels exploiting similarity measures outperform state-of-the-art rerankers.},
address = {Doha, Qatar},
author = {Saleh, Iman and Moschitti, Alessandro and Nakov, Preslav and M\`{a}rquez, Llu\'{i}s and Joty, Shafiq},
booktitle = {Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing},
month = {October},
pages = {436--442},
publisher = {Association for Computational Linguistics},
series = {EMNLP'14},
title = {Semantic Kernels for Semantic Parsing},
url = {https://aclanthology.org/D14-1050},
year = {2014}
}
@inproceedings{saleh-et-al-coling-14,
abstract = {This paper presents an empirical study on using syntactic and semantic information for Concept Segmentation and Labeling (CSL), a well-known component in spoken language understanding. Our approach is based on reranking N-best outputs from a state-of-the-art CSL parser. We perform extensive experimentation by comparing different tree-based kernels with a variety of representations of the available linguistic information, including semantic concepts, words, POS tags, shallow and full syntax, and discourse trees. The results show that the structured representation with the semantic concepts yields significant improvement over the base CSL parser, much larger compared to learning with an explicit feature vector representation. We also show that shallow syntax helps improve the results and that discourse relations can be partially beneficial.},
address = {Dublin, Ireland},
author = {Saleh, Iman and Cyphers, Scott and Glass, Jim and Joty, Shafiq and M\`{a}rquez, Llu\'{i}s and Moschitti, Alessandro and Nakov, Preslav},
booktitle = {Proceedings of the 25th International Conference on Computational Linguistics: Technical Papers},
month = {August},
pages = {193--202},
publisher = {Dublin City University and Association for Computational Linguistics},
series = {COLING'14},
title = {A Study of using Syntactic and Semantic Structures for Concept Segmentation and Labeling},
url = {https://aclanthology.org/C14-1020},
year = {2014}
}
@article{joty-carenini-ng-jair-13,
abstract = {Topic segmentation and labeling is often considered a prerequisite for higher-level conversation analysis and has been shown to be useful in many Natural Language Processing (NLP) applications. We present two new corpora of email and blog conversations annotated with topics, and evaluate annotator reliability for the segmentation and labeling tasks in these asynchronous conversations. We propose a complete computational framework for topic segmentation and labeling in asynchronous conversations. Our approach extends state-of-the-art methods by considering a fine-grained structure of an asynchronous conversation, along with other conversational features by applying recent graph-based methods for NLP. For topic segmentation, we propose two novel unsupervised models that exploit the fine-grained conversational structure, and a novel graph-theoretic supervised model that combines lexical, conversational and topic features. For topic labeling, we propose two novel (unsupervised) random walk models that respectively capture conversation specific clues from two different sources: the leading sentences and the fine-grained conversational structure. Empirical evaluation shows that the segmentation and the labeling performed by our best models beat the state-of-the-art, and are highly correlated with human annotations.},
author = {Joty, Shafiq and Carenini, Giuseppe and Ng, Raymond},
journal = {Journal of Artificial Intelligence Research},
pages = {521--573},
title = {Topic Segmentation and Labeling in Asynchronous Conversations},
url = {https://www.jair.org/media/3940/live-3940-7166-jair.pdf},
volume = {47},
year = {2013}
}
@inproceedings{joty-carenini-ng-mehdad-acl-13,
abstract = {We propose a novel approach for developing a two-stage document-level discourse parser. Our parser builds a discourse tree by applying an optimal parsing algorithm to probabilities inferred from two Conditional Random Fields: one for intra-sentential parsing and the other for multi-sentential parsing. We present two approaches to combine these two stages of discourse parsing effectively. A set of empirical evaluations over two different datasets demonstrates that our discourse parser significantly outperforms the state-of-the-art, often by a wide margin.},
address = {Sofia, Bulgaria},
author = {Joty, Shafiq and Carenini, Giuseppe and Ng, Raymond T. and Mehdad, Yashar},
booktitle = {Proceedings of the 51st Annual Meeting of the Association for Computational Linguistics},
numpages = {9},
pages = {486--496},
publisher = {Association for Computational Linguistics},
series = {ACL'13},
title = {Combining Intra- and Multi-sentential Rhetorical Parsing for Document-level Discourse Analysis},
year = {2013}
}
@inproceedings{mehdad-carenini-ng-joty-naacl-13,
abstract = {We propose a novel framework for topic labeling that assigns the most representative phrases for a given set of sentences covering the same topic. We build an entailment graph over phrases that are extracted from the sentences, and use the entailment relations to identify and select the most relevant phrases. We then aggregate those selected phrases by means of phrase generalization and merging. We motivate our approach by applying over conversational data, and show that our framework improves performance significantly over baseline algorithms.},
address = {Atlanta, Georgia},
author = {Mehdad, Yashar and Carenini, Giuseppe and Ng, Raymond T. and Joty, Shafiq},
booktitle = {Proceedings of the 2013 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies},
month = {June},
pages = {179--189},
publisher = {Association for Computational Linguistics},
series = {NAACL'13},
title = {Towards Topic Labeling with Phrase Entailment and Aggregation},
year = {2013}
}
@inproceedings{tavafi-et-al-sigdial-13,
abstract = {In this work, we study the effectiveness of state-of-the-art, sophisticated supervised learning algorithms for dialogue act modeling across a comprehensive set of different spoken and written conversations including: emails, forums, meetings, and phone conversations. To this aim, we compare the results of SVM-multiclass and two structured predictors namely SVM-hmm and CRF algorithms. Extensive empirical results, across different conversational modalities, demonstrate the effectiveness of our SVM-hmm model for dialogue act recognition in conversations.},
address = {Metz, France},
author = {Tavafi, Maryam and Mehdad, Yashar and Joty, Shafiq and Carenini, Giuseppe and Ng, Raymond},
booktitle = {Proceedings of the Special Interest Group on Discourse and Dialogue Conference},
month = {August},
pages = {117--121},
publisher = {Association for Computational Linguistics},
series = {SIGDIAL'13},
title = {Dialogue Act Recognition in Synchronous and Asynchronous Conversations},
url = {https://aclanthology.org/W13-4017},
year = {2013}
}
@inproceedings{joty-carenini-ng-emnlp-12,
abstract = {We propose a complete probabilistic discriminative framework for performing sentence-level discourse analysis. Our framework comprises a discourse segmenter, based on a binary classifier, and a discourse parser, which applies an optimal CKY-like parsing algorithm to probabilities inferred from a Dynamic Conditional Random Field. We show on two corpora that our approach outperforms the state-of-the-art, often by a wide margin.},
address = {Jeju Island, Korea},
author = {Joty, Shafiq and Carenini, Giuseppe and Ng, Raymond},
booktitle = {Proceedings of the 2012 Joint Conference on Empirical Methods in Natural Language Processing and Computational Natural Language Learning},
pages = {904--915},
publisher = {Association for Computational Linguistics},
series = {EMNLP-CoNLL'12},
title = {A Novel Discriminative Framework for Sentence-Level Discourse Analysis},
url = {https://aclanthology.org/D12-1083},
year = {2012}
}
@inproceedings{joty-carenini-ng-nwnlp-12,
address = {Microsoft Research, Redmond},
author = {Joty, Shafiq and Carenini, Giuseppe and Ng, Raymond},
booktitle = {The Pacific Northwest Regional NLP Workshop},
series = {NWNLP'12},
title = {Automatic Topic Labeling in Asynchronous Conversations},
url = {papers/joty-carenini-ng-nwnlp-12},
year = {2012}
}
@inproceedings{jin-joty-carenini-ng-nwnlp-12,
address = {Microsoft Research, Redmond},
author = {Jin, Wei and Joty, Shafiq and Carenini, Giuseppe and Ng, Raymond},
booktitle = {The Pacific Northwest Regional NLP Workshop},
series = {NWNLP'12},
title = {Detecting Informative Blog Comments using Tree Structured Conditional Random Fields},
url = {papers/jin-joty-carenini-ng-nwnlp-12},
year = {2012}
}