From 9169ed0185149f9a7afe9933e2686278d3d51cee Mon Sep 17 00:00:00 2001
From: Leo Robinovitch
Date: Sat, 23 Dec 2023 16:42:43 -0800
Subject: [PATCH] allow downloading entire comics

---
 main.go | 85 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 83 insertions(+), 2 deletions(-)

diff --git a/main.go b/main.go
index 93cf97d..838d4ee 100644
--- a/main.go
+++ b/main.go
@@ -9,11 +9,16 @@ import (
 	"io"
 	"net/http"
 	"os"
+	"regexp"
+	"sort"
+	"strconv"
 	"strings"
+	"time"
 )
 
-func getImgLinks(url string) []string {
+func getImgLinksForEpisode(url string) []string {
 	resp, err := soup.Get(url)
+	time.Sleep(500 * time.Millisecond)
 	if err != nil {
 		fmt.Println(fmt.Sprintf("Error fetching page: %v", err))
 		os.Exit(1)
@@ -23,11 +28,87 @@
 
 	var imgLinks []string
 	for _, img := range imgs {
-		imgLinks = append(imgLinks, img.Attrs()["data-url"])
+		if dataURL, ok := img.Attrs()["data-url"]; ok {
+			imgLinks = append(imgLinks, dataURL)
+		}
 	}
 	return imgLinks
 }
 
+func getEpisodeLinksForPage(url string) ([]string, error) {
+	resp, err := soup.Get(url)
+	time.Sleep(500 * time.Millisecond)
+	if err != nil {
+		return []string{}, fmt.Errorf("error fetching page: %v", err)
+	}
+	doc := soup.HTMLParse(resp)
+	episodeURLs := doc.Find("div", "class", "detail_lst").FindAll("a")
+	var links []string
+	for _, episodeURL := range episodeURLs {
+		if href := episodeURL.Attrs()["href"]; strings.Contains(href, "/viewer") {
+			links = append(links, href)
+		}
+	}
+	return links, nil
+}
+
+func getImgLinks(url string) []string {
+	if strings.Contains(url, "/viewer") {
+		// assume viewing single episode
+		return getImgLinksForEpisode(url)
+	} else {
+		// assume viewing list of episodes
+		re := regexp.MustCompile("&page=[0-9]+")
+		allEpisodeLinks := make(map[string]struct{})
+		foundLastPage := false
+		for page := 1; !foundLastPage; page++ {
+			url = re.ReplaceAllString(url, "") + fmt.Sprintf("&page=%d", page)
+			episodeLinks, err := getEpisodeLinksForPage(url)
+			if err != nil {
+				break
+			}
+			for _, episodeLink := range episodeLinks {
+				// when you go past the last page, it just rerenders the last page
+				if _, ok := allEpisodeLinks[episodeLink]; ok {
+					foundLastPage = true
+					break
+				}
+				allEpisodeLinks[episodeLink] = struct{}{}
+			}
+			if !foundLastPage {
+				println(url)
+			}
+		}
+		keys := make([]string, 0, len(allEpisodeLinks))
+		for k := range allEpisodeLinks {
+			keys = append(keys, k)
+		}
+		// extract episode_no from url and sort by it
+		re = regexp.MustCompile("episode_no=([0-9]+)")
+		episodeNo := func(episodeLink string) int {
+			matches := re.FindStringSubmatch(episodeLink)
+			if len(matches) != 2 {
+				return 0
+			}
+			episodeNo, err := strconv.Atoi(matches[1])
+			if err != nil {
+				return 0
+			}
+			return episodeNo
+		}
+		sort.Slice(keys, func(i, j int) bool {
+			return episodeNo(keys[i]) < episodeNo(keys[j])
+		})
+
+		var allImgLinks []string
+		for _, episodeLink := range keys {
+			println(episodeLink)
+			allImgLinks = append(allImgLinks, getImgLinksForEpisode(episodeLink)...)
+		}
+		return allImgLinks
+	}
+}
+
 func fetchImage(imgLink string) []byte {
 	req, e := http.NewRequest("GET", imgLink, nil)
 	if e != nil {