660 lines
17 KiB
Ruby
660 lines
17 KiB
Ruby
# frozen_string_literal: true
|
|
|
|
module RBMark
|
|
# Chunking parser: repeatedly finds the earliest element marker in the text,
# lets the matching element class parse its span, and hands everything
# in between to the default class.
class Parser
  # @param variants [Array<#begin, #end, #parse>] element classes to look for
  # @param default [#parse] parser used for text not claimed by any variant
  def initialize(variants, default)
    @default = default
    @variants = variants
    # Maps each variant's start pattern to the variant itself.
    @markers = @variants.to_h { |x| [x.begin, x] }
  end

  # Parse text using the given variants
  # @param text [String, nil]
  # @return [Array<::RBMark::DOM::DOMObject>] flattened list of parsed chunks
  def parse(text)
    chunks = []
    until text.nil? || text.empty?
      before, chunk, text = get_chunk(text)
      chunks.append(@default.parse(before)) unless before.empty?
      chunks.append(chunk) if chunk
    end
    # The default parser may return an array (one entry per sub-chunk).
    chunks.flatten
  end

  private

  # Split off the next element from the text.
  # @param text [String]
  # @return [Array(String, Object, String), Array(String, nil, nil)]
  #   text before the element, the parsed element (or nil), remaining text
  def get_chunk(text)
    element, match = get_element(text)
    return [text, nil, nil] unless element

    chunk, after = finalize_element(text[match.begin(0)..], element)
    return [match.pre_match, chunk, after] if chunk

    # No closing pattern was found: the text matched by the start pattern is
    # skipped and parsing resumes right after it.
    # NOTE(review): this drops the marker characters from the output.
    [match.pre_match, nil, match.post_match]
  end

  # Find the variant whose start pattern matches earliest in the text.
  # Each pattern is evaluated exactly once per call.
  # @param text [String]
  # @return [Array(Object, MatchData), nil]
  def get_element(text)
    @markers.filter_map do |pattern, element|
      match = text.match(pattern)
      [element, match] if match
    end.min_by { |_element, match| match.begin(0) }
  end

  # Cut the element's span (up to the end of its closing pattern) and parse it.
  # @param text [String] text starting at the element's start marker
  # @param element [#end, #parse]
  # @return [Array(Object, String), Array(nil, nil)]
  def finalize_element(text, element)
    match = text.match(element.end)
    return nil, nil unless match

    cut = match.end(0)
    [element.parse(text[...cut]), text[cut..]]
  end
end
|
|
|
|
# Module for representing abstract object hierarchy
|
|
module DOM
|
|
# Abstract container: base class for every node of the parsed tree.
#
# Per-class configuration lives in class instance variables:
# * variants      - element classes the parser looks for inside this element
# * default_class - class that parses text not claimed by any variant
# * begin / end   - patterns the Parser uses to locate this element
# * alt_for       - class instantiated in place of this one (see .create)
class DOMObject
  class << self
    attr_accessor :variants, :begin, :end, :default_class, :alt_for

    # Hook for initializing variables
    # Registers the subclass and gives it its own copy of the variant list
    # plus the current default class.
    # @param subclass [Class]
    def inherited(subclass)
      super
      (@subclasses ||= []).append(subclass)
      subclass.variants = @variants ? @variants.dup : []
      subclass.default_class = @default_class
    end

    # Add potential sub-element variant
    # The variant is also added to every registered subclass.
    # @param cls [Class] DOMObject subclass
    # @raise [StandardError] if cls is not a DOMObject subclass
    def variant(cls)
      assert_dom_subclass(cls)
      # The base class never passes through `inherited`, so its list may
      # still be nil here.
      (@variants ||= []).append(cls)
      @subclasses&.each { |subclass| subclass.variant(cls) }
    end

    # Set default element class
    # The default is also set on every registered subclass.
    # @param cls [Class] DOMObject subclass
    # @raise [StandardError] if cls is not a DOMObject subclass
    def default(cls)
      assert_dom_subclass(cls)
      @default_class = cls
      @subclasses&.each { |subclass| subclass.default(cls) }
    end

    # Parse text from the given context
    # @param text [String]
    # @return [self]
    def parse(text)
      parser = ::RBMark::Parser.new(@variants, @default_class)
      instance = create
      instance.append(*parser.parse(text))
      instance
    end

    # Create a new instance of class or referenced class
    # @return [self, Class] instance of alt_for when set, otherwise of self
    def create
      @alt_for ? @alt_for.new : new
    end

    private

    # Guard shared by .variant and .default.
    # @param cls [Class]
    # @raise [StandardError] if cls is not a DOMObject subclass
    def assert_dom_subclass(cls)
      return if cls < ::RBMark::DOM::DOMObject

      raise StandardError, "#{cls} is not a DOMObject subclass"
    end
  end

  # @return [String, nil] text content of the object
  # @return [Array<DOMObject>] child objects
  # @return [Hash] properties set through #property
  attr_reader :content, :children, :properties

  def initialize
    @content = nil
    @children = []
    @properties = {}
  end

  # Set certain property in the properties hash
  # @param properties [Hash] properties to update
  def property(**properties)
    @properties.update(properties)
  end

  # Add children to container
  # @param children [Array<DOMObject>]
  # @raise [StandardError] if any argument is not a DOMObject
  def append(*children)
    unless children.all? { |x| x.is_a? DOMObject }
      raise StandardError, "one of #{children.inspect} is not a DOMObject"
    end

    @children.append(*children)
  end

  # Insert a child into the container
  # @param index [Integer]
  # @param child [DOMObject]
  # @raise [StandardError] if child is not a DOMObject
  def insert(index, child)
    raise StandardError, "not a DOMObject" unless child.is_a? DOMObject

    @children.insert(index, child)
  end

  # Delete a child from container
  # @param index [Integer]
  def delete_at(index)
    @children.delete_at(index)
  end

  # Get a child from the container
  # @param key [Integer]
  def [](key)
    @children[key]
  end

  # Set text content of a DOMObject
  # @param text [String]
  # @raise [StandardError] if text is not a String
  def content=(text)
    raise StandardError, "not a String" unless text.is_a? String

    @content = text
  end

  # Get length of text contents
  # @return [Integer] sum of the children's lengths (0 without children)
  def length
    @children.sum(&:length)
  end
end
|
|
|
|
# Inline text: a leaf node that only carries string content.
class Text < DOMObject
  # Stub parser for inline text element
  # Every run of whitespace (newlines included) becomes a single space.
  # @param text [String]
  # @return [self]
  def self.parse(text)
    instance = new
    # \s already covers "\n", so one pass is enough.
    instance.content = text.gsub(/\s+/, " ")
    instance
  end

  # Get length of inline text
  # @return [Integer] content length, 0 when no content has been set
  def length
    @content ? @content.length : 0
  end
end
|
|
|
|
# Inline preformatted text
class InlinePre < DOMObject
  self.begin = /(?<!\\)`(?!`)/
  self.end = /`.+?`/

  # Stub parser for inline text element
  # Wraps the text, unprocessed, in a single Text child.
  # NOTE(review): when invoked by the Parser the text still includes the
  # surrounding backticks — confirm the consumer expects that.
  # @param text [String]
  # @return [self]
  def self.parse(text)
    instance = new
    element = ::RBMark::DOM::Text.new
    element.content = text
    instance.append element
    instance
  end

  # Get length of inline preformatted text
  # .parse stores the text in a child and leaves @content unset, so fall
  # back to the children's length instead of reporting 0; otherwise a
  # container holding only this element measures 0 and is vacuumed.
  # @return [Integer]
  def length
    @content ? @content.length : super
  end
end
|
|
|
|
# Inline formattable text
class InlineFormattable < DOMObject
  # (see ::RBMark::DOM::DOMObject.parse)
  def self.parse(text)
    parsed = super(text)
    cleanup(parsed)
  end

  # Clean up internal text chunks
  # For every child that is first or directly follows an InlineBreak:
  # a Text child loses its leading whitespace, an InlineBreak child is removed.
  # @param paragraph [DOMObject]
  # @return [DOMObject] the same object, modified in place
  def self.cleanup(paragraph)
    stale = []
    preceding = nil
    paragraph.children.each_with_index do |node, position|
      at_line_start = preceding.nil? ||
                      preceding.is_a?(::RBMark::DOM::InlineBreak)
      preceding = node
      next unless at_line_start

      case node
      when ::RBMark::DOM::Text then node.content = node.content.lstrip
      when ::RBMark::DOM::InlineBreak then stale.append(position)
      end
    end
    # Remove from the back so earlier indexes stay valid.
    stale.reverse_each { |position| paragraph.delete_at(position) }
    paragraph
  end
end
|
|
|
|
# Bold text, wrapped in double asterisks
class InlineBold < InlineFormattable
  self.begin = /(?<!\\)\*\*(?!\*\*)/
  self.end = /\*\*.+?\*\*/

  # Parses the text with its first two and last two characters removed.
  # (see ::RBMark::DOM::DOMObject.parse)
  def self.parse(text)
    inner = text[2...-2]
    super(inner)
  end
end
|
|
|
|
# Italics text, wrapped in single asterisks
class InlineItalics < InlineFormattable
  self.begin = /(?<!\\)\*(?!\*)/
  self.end = /\*.+?\*/

  # Parses the text with its first and last character removed.
  # (see ::RBMark::DOM::DOMObject.parse)
  def self.parse(text)
    inner = text[1...-1]
    super(inner)
  end
end
|
|
|
|
# Inline italics text (alternative), wrapped in single underscores.
# Because alt_for is set, parsing yields InlineItalics instances.
class InlineAltItalics < InlineFormattable
  self.begin = /(?<!\\)_(?!_)/
  self.end = /_.+?_/
  self.alt_for = ::RBMark::DOM::InlineItalics

  # Parses the text with its first and last character removed.
  # (see ::RBMark::DOM::DOMObject.parse)
  def self.parse(text)
    inner = text[1...-1]
    super(inner)
  end
end
|
|
|
|
# Underline text, wrapped in double underscores
class InlineUnder < InlineFormattable
  self.begin = /(?<!\\)__(?!__)/
  self.end = /__.+?__/

  # Parses the text with its first two and last two characters removed.
  # (see ::RBMark::DOM::DOMObject.parse)
  def self.parse(text)
    inner = text[2...-2]
    super(inner)
  end
end
|
|
|
|
# Strikethrough text, wrapped in double tildes
class InlineStrike < InlineFormattable
  self.begin = /(?<!\\)~~(?!~~)/
  self.end = /~~.+?~~/

  # Parses the text with its first two and last two characters removed.
  # (see ::RBMark::DOM::DOMObject.parse)
  def self.parse(text)
    inner = text[2...-2]
    super(inner)
  end
end
|
|
|
|
# Hyperreferenced text in the form [label](target)
class InlineLink < InlineFormattable
  # The whole link is matched by one pattern, so begin and end are the same.
  self.begin = /(?<![!\\])\[[^\]]+?\]\([^)]+?\)/
  self.end = self.begin

  # Parses the label as inline content and stores the target under the
  # :link property.
  # (see ::RBMark::DOM::DOMObject.parse)
  def self.parse(text)
    label, target = text.match(/\[([^\]]+?)\]\(([^)]+?)\)/).captures
    super(label).tap { |node| node.property link: target }
  end
end
|
|
|
|
# Image in the form ![label](target)
class InlineImage < InlinePre
  # The whole image is matched by one pattern, so begin and end are the same.
  self.begin = /(?<!\\)!\[[^\]]+?\]\([^)]+?\)/
  self.end = self.begin

  # Keeps the label as unprocessed text and stores the target under the
  # :link property.
  # (see ::RBMark::DOM::DOMObject.parse)
  def self.parse(text)
    label, target = text.match(/!\[([^\]]+?)\]\(([^)]+?)\)/).captures
    super(label).tap { |node| node.property link: target }
  end
end
|
|
|
|
# Linebreak: an element without content
class InlineBreak < DOMObject
  # NOTE(review): the patterns are reproduced exactly as they appear in this
  # copy of the file; confirm them against the upstream source, since runs of
  # whitespace may have been collapsed.
  self.begin = / /
  self.end = / /

  class << self
    # The matched text is ignored; a break carries no data.
    # (see ::RBMark::DOM::DOMObject.parse)
    def parse(_text)
      new
    end
  end

  # Stub for inline break length
  # @return [Integer] always 0
  def length
    0
  end
end
|
|
|
|
# Document root
class Document < DOMObject
  # Normalizes line endings, parses, drops empty elements and merges
  # adjacent list fragments.
  # (see ::RBMark::DOM::DOMObject.parse)
  def self.parse(text)
    merge(vacuum(super(normalize_newlines(text))))
  end

  # Replace all forms of line endings with UNIX format newline
  # @param text [String]
  # @return [String]
  def self.normalize_newlines(text)
    text.gsub(/(?:\r\n|\n\r|\r|\n)/, "\n")
  end

  # Remove all elements with absolute length of 0
  # @param document [DOMObject]
  # @return [DOMObject] the same object, modified in place
  def self.vacuum(document)
    empty = document.children.each_with_index.filter_map do |element, index|
      index unless element.length.positive?
    end
    # Delete from the back so earlier indexes stay valid.
    empty.reverse_each { |index| document.delete_at(index) }
    document
  end

  # Merge adjacent lists with indent blocks as per markdownguide guidelines
  # @param document [DOMObject]
  # @return [DOMObject] the same object, modified in place
  def self.merge(document)
    last_list = nil
    delete_deferred = []
    document.children.each_with_index do |child, index|
      if last_list && mergeable?(last_list, child)
        merge_adjacent(last_list, child)
        delete_deferred.append(index)
      else
        # A list that cannot be merged into the previous one starts a new
        # merge target itself; anything else ends the current run.
        last_list = list?(child) ? child : nil
      end
    end
    delete_deferred.reverse_each { |index| document.delete_at(index) }
    document
  end

  # Check if an element is a list block (unordered or ordered)
  # @param element [DOMObject]
  # @return [Boolean]
  def self.list?(element)
    [::RBMark::DOM::ULBlock, ::RBMark::DOM::OLBlock].include?(element.class)
  end

  # Check if 2 elements can be merged
  # An indent block merges into any list, an unordered list into an unordered
  # list, and an ordered list into an ordered list only when its first number
  # is greater than the last number of the preceding list.
  # @param list [DOMObject] preceding list
  # @param child [DOMObject] candidate to merge into the list
  # @return [Boolean]
  def self.mergeable?(list, child)
    case child
    when ::RBMark::DOM::IndentBlock
      true
    when ::RBMark::DOM::ULBlock
      list.is_a?(::RBMark::DOM::ULBlock)
    when ::RBMark::DOM::OLBlock
      # OLBlock stores the number as captured text; compare numerically so
      # that e.g. 10 sorts after 9.
      list.is_a?(::RBMark::DOM::OLBlock) &&
        child.children.first.properties[:number].to_i >
          list.children.last.properties[:number].to_i
    else
      false
    end
  end

  # Merge 2 elements
  # A list donates its children to the left list; an indent block is parsed
  # as list element content and appended to the left list's last child.
  # @param left [DOMObject] list that receives the content
  # @param right [DOMObject] list or indent block being merged
  def self.merge_adjacent(left, right)
    case right
    when ::RBMark::DOM::ULBlock, ::RBMark::DOM::OLBlock
      left.append(*right.children)
    when ::RBMark::DOM::IndentBlock
      left.children.last.append(
        *::RBMark::DOM::ListElement.parse(right.children.first.content)
                                   .children
      )
    end
  end
end
|
|
|
|
# Paragraph in a document (separated by 2 newlines)
class Paragraph < InlineFormattable
  # Splits the text on blank lines and parses each piece as its own paragraph.
  # (see ::RBMark::DOM::DOMObject.parse)
  # @return [Array<self>]
  def self.parse(text)
    sections = text.split("\n\n")
    sections.map { |section| super(section) }
  end
end
|
|
|
|
# Heading level 1
class Heading1 < InlineFormattable
  self.begin = /^# /
  self.end = /#?$/

  # Removes whatever the class's own begin and end patterns match, then
  # parses the rest as inline content. Subclasses reuse this method with
  # their own patterns.
  # (see ::RBMark::DOM::DOMObject.parse)
  def self.parse(text)
    without_prefix = text.gsub(self.begin, '')
    title = without_prefix.gsub(self.end, '')
    super(title)
  end
end
|
|
|
|
# Heading level 2: reuses Heading1.parse with two-hash patterns
class Heading2 < Heading1
  self.begin = /^## /
  self.end = /(?:##)?$/
end
|
|
|
|
# Heading level 3: reuses Heading1.parse with three-hash patterns
class Heading3 < Heading1
  self.begin = /^### /
  self.end = /(?:###)?$/
end
|
|
|
|
# Heading level 4: reuses Heading1.parse with four-hash patterns
class Heading4 < Heading1
  self.begin = /^#### /
  self.end = /(?:####)?$/
end
|
|
|
|
# Heading level 5: reuses Heading1.parse with five-hash patterns
class Heading5 < Heading1
  self.begin = /^##### /
  self.end = /(?:#####)?$/
end
|
|
|
|
# Heading level 6: reuses Heading1.parse with six-hash patterns
class Heading6 < Heading1
  self.begin = /^###### /
  self.end = /(?:######)?$/
end
|
|
|
|
# Alternative heading 1: a text line underlined with three or more "=".
# Because alt_for is set, parsing yields Heading1 instances.
class AltHeading1 < InlineFormattable
  self.begin = /^[^\n]+\n={3,}$/m
  self.end = /={3,}$/
  self.alt_for = ::RBMark::DOM::Heading1

  # Parses only the first line of the text, stripped of surrounding
  # whitespace, as inline content.
  # (see ::RBMark::DOM::DOMObject.parse)
  def self.parse(text)
    title_line = text.match(/\A[^\n]+$/)[0]
    super(title_line.strip)
  end
end
|
|
|
|
# Alternative heading 2: a text line underlined with three or more "-".
# Because alt_for is set, parsing yields Heading2 instances.
class AltHeading2 < InlineFormattable
  self.begin = /^[^\n]+\n-{3,}$/m
  self.end = /-{3,}$/
  self.alt_for = ::RBMark::DOM::Heading2

  # Parses only the first line of the text, stripped of surrounding
  # whitespace, as inline content.
  # (see ::RBMark::DOM::DOMObject.parse)
  def self.parse(text)
    title_line = text.match(/\A[^\n]+$/)[0]
    super(title_line.strip)
  end
end
|
|
|
|
# Preformatted code block fenced by lines starting with three backticks
class CodeBlock < DOMObject
  self.begin = /^```[^\n]*$/
  self.end = /^```[^\n]*\n.*?\n```$/m

  # Stub parser for block text element
  # The text after the opening backticks becomes the :language property;
  # the lines between the first and last line become one Text child with
  # trailing whitespace removed.
  # @param text [String]
  # @return [self]
  def self.parse(text)
    info_string = text.match(/\A```([^\n]*)/)[1].strip
    body = ::RBMark::DOM::Text.new
    body.content = text.lines[1..-2].join('').rstrip

    block = new
    block.append(body)
    block.property language: info_string
    block
  end
end
|
|
|
|
# Quote block: consecutive lines starting with ">", parsed as a nested document
class QuoteBlock < Document
  self.begin = /^> \S/
  self.end = /(?:^(?!>)|\Z)/

  # Drops the first two characters of every line and parses the result
  # as a document.
  # @param text [String]
  # @return [self]
  def self.parse(text)
    unquoted = text.lines.map { |line| line[2..] }
    super(unquoted.join(''))
  end
end
|
|
|
|
# Table
# Placeholder: defines no patterns or parser of its own.
class TableBlock < DOMObject
end
|
|
|
|
# List element
# Content of a single list item, parsed with the Document rules.
class ListElement < Document
end
|
|
|
|
# Unordered list
class ULBlock < DOMObject
  # NOTE(review): the patterns and the "- " prefix are reproduced exactly as
  # they appear in this copy of the file; confirm the indent width in the end
  # pattern against the upstream source.
  self.begin = /^- +\S+/
  self.end = /(?:^(?!- +\S+| )|\Z)/

  # Groups the lines into items (a new item starts at each line beginning
  # with "- "), drops the first two characters of every line and parses each
  # item as a ListElement.
  # (see RBMark::DOM::DOMObject.parse)
  def self.parse(text)
    list = new
    items = text.lines.slice_before { |line| line.start_with?("- ") }.to_a
    # Even for empty text one (empty) element is produced.
    items = [[]] if items.empty?
    items.each do |rows|
      body = rows.map { |row| row[2..] }.join('')
      list.append(::RBMark::DOM::ListElement.parse(body))
    end
    list
  end
end
|
|
|
|
# Ordered list block
class OLBlock < DOMObject
  self.begin = /^\d+\. +\S+/
  self.end = /(?:^(?!\d+\. +\S+| {4})|\Z)/

  # Groups the lines into items (a new item starts at each line beginning
  # with "<digits>. "), strips the number prefix or a four-space indent from
  # every line, and builds one numbered ListElement per item.
  # (see RBMark::DOM::DOMObject.parse)
  def self.parse(text)
    list = new
    items = text.lines.slice_before { |line| line.start_with?(/^\d+\. /) }.to_a
    # Even for empty text one (empty, unnumbered) element is produced.
    items = [[]] if items.empty?
    items.each do |rows|
      # The item's number is taken from its first line; nil when absent.
      number = rows.first && rows.first[/^(\d+)\. /, 1]
      body = rows.map { |row| row.gsub(/^(?:\d+\. | {4})/, '') }.join('')
      list.append(element(body, number))
    end
    list
  end

  # Construct a new ListElement
  # @param text [String] item content
  # @param counter [String, nil] item number as captured from the text
  # @return [::RBMark::DOM::ListElement] element with the :number property set
  def self.element(text, counter)
    ::RBMark::DOM::ListElement.parse(text).tap do |item|
      item.property number: counter
    end
  end
end
|
|
|
|
# Indent block: consecutive lines indented by four spaces
class IndentBlock < DOMObject
  self.begin = /^ {4}/
  self.end = /(?:^(?! {4})|\Z)/

  # Drops the first four characters of every line and stores the result,
  # unprocessed, in a single Text child.
  # (see RBMark::DOM::DOMObject.parse)
  def self.parse(text)
    body = ::RBMark::DOM::Text.new
    body.content = text.lines.map { |line| line[4..] }.join('')

    block = new
    block.append(body)
    block
  end
end
|
|
|
|
# Horizontal rule: a line of three or more dashes
class HorizontalRule < DOMObject
  self.begin = /^-{3,}$/
  self.end = /$/

  class << self
    # stub for HR
    # The matched text is ignored; a rule carries no data.
    def parse(_text)
      new
    end
  end

  # Stub for HR length
  # Reports a positive length so that the rule is never treated as empty.
  # @return [Integer] always 1
  def length
    1
  end
end
|
|
|
|
# Inline grammar: plain text by default, plus every inline element.
# Registered after all classes exist so that the settings propagate to
# every InlineFormattable subclass.
InlineFormattable.class_exec do
  default ::RBMark::DOM::Text
  [
    ::RBMark::DOM::InlineBold,
    ::RBMark::DOM::InlineItalics,
    ::RBMark::DOM::InlineAltItalics,
    ::RBMark::DOM::InlineUnder,
    ::RBMark::DOM::InlineImage,
    ::RBMark::DOM::InlineLink,
    ::RBMark::DOM::InlinePre,
    ::RBMark::DOM::InlineStrike,
    ::RBMark::DOM::InlineBreak
  ].each { |element| variant element }
end
|
|
|
|
# Block grammar: paragraphs by default, plus every block element.
# Registered after all classes exist so that the settings propagate to
# every Document subclass.
Document.class_exec do
  default ::RBMark::DOM::Paragraph
  [
    ::RBMark::DOM::Heading1,
    ::RBMark::DOM::Heading2,
    ::RBMark::DOM::Heading3,
    ::RBMark::DOM::Heading4,
    ::RBMark::DOM::Heading5,
    ::RBMark::DOM::Heading6,
    ::RBMark::DOM::AltHeading1,
    ::RBMark::DOM::AltHeading2,
    ::RBMark::DOM::QuoteBlock,
    ::RBMark::DOM::CodeBlock,
    ::RBMark::DOM::ULBlock,
    ::RBMark::DOM::OLBlock,
    ::RBMark::DOM::IndentBlock,
    ::RBMark::DOM::HorizontalRule
  ].each { |element| variant element }
end
|
|
end
|
|
end
|